Example #1
def main():
    arg_parser = ArgumentParser(description=DESCRIPTION)
    arg_parser.add_argument("--external_id", required=True)
    args = arg_parser.parse_args()
    external_id = load(args.external_id)
    session = DrmaaSessionFactory().get()
    external_id = session.kill(external_id)
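For context, a self-contained version of this kill script might look like the sketch below. The imports, the DESCRIPTION wording, and the omission of the unshown load helper (the external id is passed straight through here) are assumptions, not part of the snippet above; DrmaaSessionFactory comes from pulsar.managers.util.drmaa, as the later examples import it.

# A minimal sketch of a complete kill script, assuming the external id can be
# used directly as parsed from the command line.
from argparse import ArgumentParser

from pulsar.managers.util.drmaa import DrmaaSessionFactory

DESCRIPTION = "Kill a DRM job through the DRMAA interface."  # assumed wording


def main():
    arg_parser = ArgumentParser(description=DESCRIPTION)
    arg_parser.add_argument("--external_id", required=True)
    args = arg_parser.parse_args()
    session = DrmaaSessionFactory().get()
    session.kill(args.external_id)  # kill() is called for its side effect only


if __name__ == "__main__":
    main()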
Example #2
    def __init__(self, app, nworkers, **kwargs):
        """Start the job runner"""
        global drmaa

        runner_param_specs = {
            'drmaa_library_path': dict(map=str, default=os.environ.get('DRMAA_LIBRARY_PATH', None))}
        for retry_exception in RETRY_EXCEPTIONS_LOWER:
            runner_param_specs[retry_exception + '_state'] = dict(map=str, valid=lambda x: x in (model.Job.states.OK, model.Job.states.ERROR), default=model.Job.states.OK)
            runner_param_specs[retry_exception + '_retries'] = dict(map=int, valid=lambda x: int(x) >= 0, default=0)

        if 'runner_param_specs' not in kwargs:
            kwargs['runner_param_specs'] = dict()
        kwargs['runner_param_specs'].update(runner_param_specs)

        super(DRMAAJobRunner, self).__init__(app, nworkers, **kwargs)

        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info('Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path)
            os.environ['DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path

        # Import is delayed until runner initialization to allow for the
        # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH
        try:
            drmaa = __import__("drmaa")
        except (ImportError, RuntimeError) as exc:
            raise exc.__class__('The Python drmaa package is required to use this '
                                'feature, please install it or correct the '
                                'following error:\n%s: %s' %
                                (exc.__class__.__name__, str(exc)))
        from pulsar.managers.util.drmaa import DrmaaSessionFactory

        # make the drmaa library also available to subclasses
        self.drmaa = drmaa

        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState

        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }

        # Ensure a DRMAA session exists and is initialized
        self.ds = DrmaaSessionFactory().get()

        self.userid = None

        self._init_monitor_thread()
        self._init_worker_threads()
        self.redact_email_in_job_name = self.app.config.redact_email_in_job_name
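RETRY_EXCEPTIONS_LOWER is not shown in this snippet; judging from Example #8, it likely holds the lower-cased names of the two drmaa exceptions that are retried. A small standalone sketch of the spec keys the loop above would generate under that assumption:

# Sketch: expanding the retry runner parameters the same way __init__ does.
# RETRY_EXCEPTIONS_LOWER is an assumption based on Example #8.
RETRY_EXCEPTIONS_LOWER = ['internalexception', 'invalidjobexception']

runner_param_specs = {}
for retry_exception in RETRY_EXCEPTIONS_LOWER:
    # '<exception>_state' decides whether such jobs finish OK or error out
    runner_param_specs[retry_exception + '_state'] = dict(map=str, default='ok')  # 'ok' stands in for model.Job.states.OK
    # '<exception>_retries' caps how many consecutive failed status checks are tolerated
    runner_param_specs[retry_exception + '_retries'] = dict(map=int, valid=lambda x: int(x) >= 0, default=0)

print(sorted(runner_param_specs))
# ['internalexception_retries', 'internalexception_state',
#  'invalidjobexception_retries', 'invalidjobexception_state']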
Example #3
def main():
    arg_parser = ArgumentParser(description=DESCRIPTION)
    arg_parser.add_argument("--external_id", required=True)
    args = arg_parser.parse_args()
    external_id = load(args.external_id)
    session = DrmaaSessionFactory().get()
    external_id = session.kill(external_id)
Example #4
def main():
    arg_parser = ArgumentParser(description=DESCRIPTION)
    arg_parser.add_argument("--job_attributes", required=True)
    args = arg_parser.parse_args()
    job_attributes = load(open(args.job_attributes, "r"))
    session = DrmaaSessionFactory().get()
    external_id = session.run_job(**job_attributes)
    print(external_id)
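The loaded dictionary is splatted straight into session.run_job(**job_attributes), so its keys presumably match the DRMAA job template fields built by the runner examples later in this listing (remoteCommand, jobName, and so on). A hypothetical payload file, with made-up paths, could be produced like this:

# Sketch: writing a hypothetical job_attributes file for the launch script
# above. Key names mirror the jt dict in the runner examples below; all
# paths and the native specification are invented for illustration.
import json

job_attributes = {
    "remoteCommand": "/data/jobs/42/galaxy_42.sh",
    "jobName": "g42_example",
    "workingDirectory": "/data/jobs/42",
    "outputPath": ":/data/jobs/42/galaxy_42.o",   # leading ':' as in the runner
    "errorPath": ":/data/jobs/42/galaxy_42.e",
    "nativeSpecification": "-q short.q",
}

with open("job_attributes.json", "w") as fp:
    json.dump(job_attributes, fp)

# The script above would then be run with: --job_attributes job_attributes.json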
Example #5
    def __init__(self, app, nworkers, **kwargs):
        """Start the job runner"""
        global drmaa

        runner_param_specs = {
            'drmaa_library_path': dict(map=str, default=os.environ.get('DRMAA_LIBRARY_PATH', None))}
        for retry_exception in RETRY_EXCEPTIONS_LOWER:
            runner_param_specs[retry_exception + '_state'] = dict(map=str, valid=lambda x: x in (model.Job.states.OK, model.Job.states.ERROR), default=model.Job.states.OK)
            runner_param_specs[retry_exception + '_retries'] = dict(map=int, valid=lambda x: int(x) >= 0, default=0)

        if 'runner_param_specs' not in kwargs:
            kwargs['runner_param_specs'] = dict()
        kwargs['runner_param_specs'].update(runner_param_specs)

        super(DRMAAJobRunner, self).__init__(app, nworkers, **kwargs)

        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info('Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path)
            os.environ['DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path

        # Import is delayed until runner initialization to allow for the
        # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH
        try:
            drmaa = __import__("drmaa")
        except (ImportError, RuntimeError) as exc:
            raise exc.__class__('The Python drmaa package is required to use this '
                                'feature, please install it or correct the '
                                'following error:\n%s: %s' %
                                (exc.__class__.__name__, str(exc)))
        from pulsar.managers.util.drmaa import DrmaaSessionFactory

        # make the drmaa library also available to subclasses
        self.drmaa = drmaa

        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState

        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }

        # Ensure a DRMAA session exists and is initialized
        self.ds = DrmaaSessionFactory().get()

        self.userid = None

        self._init_monitor_thread()
        self._init_worker_threads()
        self.redact_email_in_job_name = self.app.config.redact_email_in_job_name
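The drmaa_job_state_strings table built here is used by the monitor loop (see check_watched_items in the later examples) to log readable state changes. A tiny standalone sketch of that lookup, with plain strings standing in for the drmaa.JobState constants:

# Sketch: state-change logging as done in check_watched_items; the dict keys
# below are placeholders for drmaa.JobState constants.
import logging

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

drmaa_job_state_strings = {
    'queued_active': 'job is queued and active',
    'running': 'job is running',
    'done': 'job finished normally',
}


def log_state_change(galaxy_id_tag, external_job_id, state):
    description = drmaa_job_state_strings.get(state, 'process status cannot be determined')
    log.debug("(%s/%s) state change: %s", galaxy_id_tag, external_job_id, description)


log_state_change('42', '1234567', 'running')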
Example #6
    def test_restart_finishes_job(self):
        test = "restart_finishes"
        with self._setup_app_provider(test) as app_provider:
            job_id = '12345'

            with app_provider.new_app() as app:
                manager = app.only_manager
                job_info = {
                    'job_id': job_id,
                    'command_line': 'sleep 1000',
                    'setup': True,
                }
                submit_job(manager, job_info)
                external_id = None
                for i in range(10):
                    time.sleep(.05)
                    # TODO: unfortunate breaking of abstractions here.
                    external_id = manager._proxied_manager._external_id(job_id)
                    if external_id:
                        break
                if external_id is None:
                    assert False, "Test failed, couldn't get exteranl id for job id."

            drmaa_session = DrmaaSessionFactory().get()
            drmaa_session.kill(external_id)
            drmaa_session.close()
            consumer = self._status_update_consumer(test)
            consumer.start()

            with app_provider.new_app() as app:
                consumer.wait_for_messages()

            consumer.join()
            assert len(consumer.messages) == 1, len(consumer.messages)
            assert consumer.messages[0]["status"] == "complete"
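The test above polls up to ten times, 0.05s apart, for the external id to appear. The same wait-and-retry pattern can be factored into a small helper; the names below are illustrative and not part of the test suite:

# Sketch: the poll-until-available pattern used in the test above.
import time


def wait_for(fetch, attempts=10, delay=0.05):
    """Call `fetch` repeatedly until it returns a truthy value or attempts run out."""
    for _ in range(attempts):
        time.sleep(delay)
        value = fetch()
        if value:
            return value
    return None


# usage mirroring the test (manager and job_id as in the test body):
# external_id = wait_for(lambda: manager._proxied_manager._external_id(job_id))
# assert external_id is not None, "Test failed, couldn't get external id for job."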
Example #7
class DRMAAJobRunner( AsynchronousJobRunner ):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    runner_name = "DRMAARunner"
    restrict_job_name_length = 15

    def __init__( self, app, nworkers, **kwargs ):
        """Start the job runner"""
        global drmaa

        runner_param_specs = {
            'drmaa_library_path': dict( map=str, default=os.environ.get( 'DRMAA_LIBRARY_PATH', None ) ) }
        for retry_exception in RETRY_EXCEPTIONS_LOWER:
            runner_param_specs[retry_exception + '_state'] = dict( map=str, valid=lambda x: x in ( model.Job.states.OK, model.Job.states.ERROR ), default=model.Job.states.OK )
            runner_param_specs[retry_exception + '_retries'] = dict( map=int, valid=lambda x: int(x) >= 0, default=0 )

        if 'runner_param_specs' not in kwargs:
            kwargs[ 'runner_param_specs' ] = dict()
        kwargs[ 'runner_param_specs' ].update( runner_param_specs )

        super( DRMAAJobRunner, self ).__init__( app, nworkers, **kwargs )

        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info( 'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path )
            os.environ['DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path

        # Import is delayed until runner initialization to allow for the
        # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH
        try:
            drmaa = __import__( "drmaa" )
        except (ImportError, RuntimeError) as exc:
            raise exc.__class__('The Python drmaa package is required to use this '
                                'feature, please install it or correct the '
                                'following error:\n%s: %s' %
                                (exc.__class__.__name__, str(exc)))
        from pulsar.managers.util.drmaa import DrmaaSessionFactory

        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState

        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }

        # Ensure a DRMAA session exists and is initialized
        self.ds = DrmaaSessionFactory().get()

        self.userid = None

        self._init_monitor_thread()
        self._init_worker_threads()

    def url_to_destination(self, url):
        """Convert a legacy URL to a job destination"""
        if not url:
            return
        native_spec = url.split('/')[2]
        if native_spec:
            params = dict( nativeSpecification=native_spec )
            log.debug( "Converted URL '%s' to destination runner=drmaa, params=%s" % ( url, params ) )
            return JobDestination( runner='drmaa', params=params )
        else:
            log.debug( "Converted URL '%s' to destination runner=drmaa" % url )
            return JobDestination( runner='drmaa' )

    def get_native_spec( self, url ):
        """Get any native DRM arguments specified by the site configuration"""
        try:
            return url.split('/')[2] or None
        except Exception:
            return None

    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job

        # external_runJob_script can be None, in which case it's not used.
        external_runjob_script = job_wrapper.get_destination_configuration("drmaa_external_runjob_script", None)

        include_metadata = asbool( job_wrapper.job_destination.params.get( "embed_metadata_in_job", True) )
        if not self.prepare_job( job_wrapper, include_metadata=include_metadata):
            return

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        job_name = self._job_name(job_wrapper)
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

        # set up the drmaa job template
        jt = dict(
            remoteCommand=ajs.job_file,
            jobName=ajs.job_name,
            workingDirectory=job_wrapper.working_directory,
            outputPath=":%s" % ajs.output_file,
            errorPath=":%s" % ajs.error_file
        )

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is not None:
            jt['nativeSpecification'] = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file)
        try:
            self.write_executable_script( ajs.job_file, script )
        except Exception:
            job_wrapper.fail( "failure preparing job script", exception=True )
            log.exception( "(%s) failure writing job script" % galaxy_id_tag )
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file %s", galaxy_id_tag, ajs.job_file )
        if native_spec:
            log.debug( "(%s) native specification is: %s", galaxy_id_tag, native_spec )

        # runJob will raise if there's a submit problem
        if external_runjob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            fail_msg = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.run_job(**jt)
                    break
                except ( drmaa.InternalException, drmaa.DeniedByDrmException ) as e:
                    trynum += 1
                    log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e )
                    fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                    time.sleep( 5 )
                except Exception:
                    log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag )
                    trynum = 5
            else:
                log.error( "(%s) All attempts to submit job failed" % galaxy_id_tag )
                if not fail_msg:
                    fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
                job_wrapper.fail( fail_msg )
                return
        else:
            job_wrapper.change_ownership_for_run()
            # if user credentials are not available, use galaxy credentials (if permitted)
            allow_guests = asbool(job_wrapper.job_destination.params.get( "allow_guests", False) )
            pwent = job_wrapper.user_system_pwent
            if pwent is None:
                if not allow_guests:
                    fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." % job_wrapper.user
                    job_wrapper.fail( fail_msg )
                    return
                pwent = job_wrapper.galaxy_system_pwent
            log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, pwent[0], pwent[2] ) )
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid = pwent[2]
            external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]).strip()
        log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )

    def _complete_terminal_job( self, ajs, drmaa_state, **kwargs ):
        """
        Handle a job upon its termination in the DRM. This method is meant to
        be overridden by subclasses to improve post-mortem and reporting of
        failures.
        Returns True if job was not actually terminal, None otherwise.
        """
        if drmaa_state == drmaa.JobState.FAILED:
            if ajs.job_wrapper.get_state() != model.Job.states.DELETED:
                ajs.stop_job = False
                ajs.fail_message = "The cluster DRM system terminated this job"
                self.work_queue.put( ( self.fail_job, ajs ) )
        elif drmaa_state == drmaa.JobState.DONE:
            # External metadata processing for external runjobs
            external_metadata = not asbool( ajs.job_wrapper.job_destination.params.get( "embed_metadata_in_job", True) )
            if external_metadata:
                self._handle_metadata_externally( ajs.job_wrapper, resolve_requirements=True )
            if ajs.job_wrapper.get_state() != model.Job.states.DELETED:
                self.work_queue.put( ( self.finish_job, ajs ) )

    def check_watched_items( self ):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for ajs in self.watched:
            external_job_id = ajs.job_id
            galaxy_id_tag = ajs.job_wrapper.get_id_tag()
            old_state = ajs.old_state
            try:
                assert external_job_id not in ( None, 'None' ), '(%s/%s) Invalid job id' % ( galaxy_id_tag, external_job_id )
                state = self.ds.job_status( external_job_id )
                # Reset exception retries
                for retry_exception in RETRY_EXCEPTIONS_LOWER:
                    setattr( ajs, retry_exception + '_retries', 0)
            except ( drmaa.InternalException, drmaa.InvalidJobException ) as e:
                ecn = type(e).__name__
                retry_param = ecn.lower() + '_retries'
                state_param = ecn.lower() + '_state'
                retries = getattr( ajs, retry_param, 0 )
                log.warning("(%s/%s) unable to check job status because of %s exception for %d consecutive tries: %s", galaxy_id_tag, external_job_id, ecn, retries + 1, e)
                if self.runner_params[ retry_param ] > 0:
                    if retries < self.runner_params[ retry_param ]:
                        # will retry check on next iteration
                        setattr( ajs, retry_param, retries + 1 )
                        new_watched.append( ajs )
                        continue
                if self.runner_params[ state_param ] == model.Job.states.OK:
                    log.warning( "(%s/%s) job will now be finished OK", galaxy_id_tag, external_job_id )
                    self.work_queue.put( ( self.finish_job, ajs ) )
                elif self.runner_params[ state_param ] == model.Job.states.ERROR:
                    log.warning( "(%s/%s) job will now be errored", galaxy_id_tag, external_job_id )
                    self.work_queue.put( ( self.fail_job, ajs ) )
                else:
                    raise Exception( "%s is set to an invalid value (%s), this should not be possible. See galaxy.jobs.drmaa.__init__()", state_param, self.runner_params[ state_param ] )
                continue
            except drmaa.DrmCommunicationException as e:
                log.warning( "(%s/%s) unable to communicate with DRM: %s", galaxy_id_tag, external_job_id, e )
                new_watched.append( ajs )
                continue
            except Exception as e:
                # so we don't kill the monitor thread
                log.exception( "(%s/%s) unable to check job status: %s" % ( galaxy_id_tag, external_job_id, e ) )
                log.warning( "(%s/%s) job will now be errored" % ( galaxy_id_tag, external_job_id ) )
                ajs.fail_message = "Cluster could not complete job"
                self.work_queue.put( ( self.fail_job, ajs ) )
                continue
            if state != old_state:
                log.debug( "(%s/%s) state change: %s" % ( galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state] ) )
            if state == drmaa.JobState.RUNNING and not ajs.running:
                ajs.running = True
                ajs.job_wrapper.change_state( model.Job.states.RUNNING )
            if state in ( drmaa.JobState.FAILED, drmaa.JobState.DONE ):
                if self._complete_terminal_job( ajs, drmaa_state=state ) is not None:
                    # job was not actually terminal
                    state = ajs.old_state
                else:
                    continue
            if ajs.check_limits():
                self.work_queue.put( ( self.fail_job, ajs ) )
                continue
            ajs.old_state = state
            new_watched.append( ajs )
        # Replace the watch list with the updated version
        self.watched = new_watched

    def stop_job( self, job ):
        """Attempts to delete a job from the DRM queue"""
        try:
            ext_id = job.get_job_runner_external_id()
            assert ext_id not in ( None, 'None' ), 'External job id is None'
            kill_script = job.get_destination_configuration(self.app.config, "drmaa_external_killjob_script", None)
            if kill_script is None:
                self.ds.kill( ext_id )
            else:
                # FIXME: hardcoded path
                subprocess.Popen( [ '/usr/bin/sudo', '-E', kill_script, str( ext_id ), str( self.userid ) ], shell=False )
            log.debug( "(%s/%s) Removed from DRM queue at user's request" % ( job.get_id(), ext_id ) )
        except drmaa.InvalidJobException:
            log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.get_id(), ext_id ) )
        except Exception as e:
            log.debug( "(%s/%s) User killed running job, but error encountered removing from DRM queue: %s" % ( job.get_id(), ext_id, e ) )

    def recover( self, job, job_wrapper ):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        job_id = job.get_job_runner_external_id()
        if job_id is None:
            self.put( job_wrapper )
            return
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )
        ajs.job_id = str( job_id )
        ajs.command_line = job.get_command_line()
        ajs.job_wrapper = job_wrapper
        ajs.job_destination = job_wrapper.job_destination
        if job.state == model.Job.states.RUNNING:
            log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
            ajs.old_state = drmaa.JobState.RUNNING
            ajs.running = True
            self.monitor_queue.put( ajs )
        elif job.get_state() == model.Job.states.QUEUED:
            log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
            ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
            ajs.running = False
            self.monitor_queue.put( ajs )

    def store_jobtemplate(self, job_wrapper, jt):
        """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string.
        Path is hard-coded, but it's no worse than other paths in this module.
        Uses Galaxy's JobID, so file is expected to be unique."""
        filename = "%s/%s.jt_json" % (self.app.config.cluster_files_directory, job_wrapper.get_id_tag())
        with open(filename, 'w+') as fp:
            json.dump(jt, fp)
        log.debug( '(%s) Job script for external submission is: %s' % ( job_wrapper.job_id, filename ) )
        return filename

    def external_runjob(self, external_runjob_script, jobtemplate_filename, username):
        """ runs an external script the will QSUB a new job.
        The external script will be run with sudo, and will setuid() to the specified user.
        Effectively, will QSUB as a different user (then the one used by Galaxy).
        """
        script_parts = external_runjob_script.split()
        script = script_parts[0]
        command = [ '/usr/bin/sudo', '-E', script]
        for script_argument in script_parts[1:]:
            command.append(script_argument)

        command.extend( [ str(username), jobtemplate_filename ] )
        log.info("Running command %s" % command)
        p = subprocess.Popen(command,
                             shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = p.communicate()
        exitcode = p.returncode
        # os.unlink(jobtemplate_filename)
        if exitcode != 0:
            # There was an error in the child process
            raise RuntimeError("External_runjob failed (exit code %s)\nChild process reported error:\n%s" % (str(exitcode), stderrdata))
        if not stdoutdata.strip():
            raise RuntimeError("External_runjob did return the job id: %s" % (stdoutdata))

        # The expected output is a single line containing a single numeric value:
        # the DRMAA job-ID. If not the case, will throw an error.
        jobId = stdoutdata
        return jobId

    def _job_name(self, job_wrapper):
        external_runjob_script = job_wrapper.get_destination_configuration("drmaa_external_runjob_script", None)
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if external_runjob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join( map( lambda x: x if x in ( string.ascii_letters + string.digits + '_' ) else '_', job_name ) )
        if self.restrict_job_name_length:
            job_name = job_name[:self.restrict_job_name_length]
        return job_name
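The _job_name helper above replaces every character outside letters, digits, and underscore, then truncates to restrict_job_name_length (15 here). A standalone sketch of that transformation on a made-up name, using string.ascii_letters so it also runs on Python 3:

# Sketch: the job-name sanitization performed by _job_name, applied to a
# hypothetical input.
import string

ALLOWED = string.ascii_letters + string.digits + '_'


def sanitize_job_name(job_name, max_length=15):
    job_name = ''.join(c if c in ALLOWED else '_' for c in job_name)
    return job_name[:max_length]


print(sanitize_job_name('g42_upload1_user@example.org'))
# -> 'g42_upload1_use' (invalid characters replaced, then truncated to 15)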
Example #8
class DRMAAJobRunner(AsynchronousJobRunner):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    runner_name = "DRMAARunner"

    def __init__(self, app, nworkers, **kwargs):
        """Start the job runner"""
        global drmaa

        runner_param_specs = dict(
            drmaa_library_path=dict(map=str,
                                    default=os.environ.get(
                                        'DRMAA_LIBRARY_PATH', None)),
            invalidjobexception_state=dict(
                map=str,
                valid=lambda x: x in
                (model.Job.states.OK, model.Job.states.ERROR),
                default=model.Job.states.OK),
            invalidjobexception_retries=dict(
                map=int, valid=lambda x: int(x) >= 0, default=0),
            internalexception_state=dict(
                map=str,
                valid=lambda x: x in
                (model.Job.states.OK, model.Job.states.ERROR),
                default=model.Job.states.OK),
            internalexception_retries=dict(
                map=int, valid=lambda x: int(x) >= 0, default=0))

        if 'runner_param_specs' not in kwargs:
            kwargs['runner_param_specs'] = dict()
        kwargs['runner_param_specs'].update(runner_param_specs)

        super(DRMAAJobRunner, self).__init__(app, nworkers, **kwargs)

        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info(
                'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s',
                self.runner_params.drmaa_library_path)
            os.environ[
                'DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path

        # Import is delayed until runner initialization to allow for the
        # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH
        try:
            drmaa = __import__("drmaa")
        except (ImportError, RuntimeError) as exc:
            raise exc.__class__(
                'The Python drmaa package is required to use this '
                'feature, please install it or correct the '
                'following error:\n%s: %s' %
                (exc.__class__.__name__, str(exc)))

        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState

        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD:
            'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }

        # Ensure a DRMAA session exists and is initialized
        self.ds = DrmaaSessionFactory().get()

        self.userid = None

        self._init_monitor_thread()
        self._init_worker_threads()

    def url_to_destination(self, url):
        """Convert a legacy URL to a job destination"""
        if not url:
            return
        native_spec = url.split('/')[2]
        if native_spec:
            params = dict(nativeSpecification=native_spec)
            log.debug(
                "Converted URL '%s' to destination runner=drmaa, params=%s" %
                (url, params))
            return JobDestination(runner='drmaa', params=params)
        else:
            log.debug("Converted URL '%s' to destination runner=drmaa" % url)
            return JobDestination(runner='drmaa')

    def get_native_spec(self, url):
        """Get any native DRM arguments specified by the site configuration"""
        try:
            return url.split('/')[2] or None
        except Exception:
            return None

    def queue_job(self, job_wrapper):
        """Create job script and submit it to the DRM"""
        # prepare the job

        # external_runJob_script can be None, in which case it's not used.
        external_runjob_script = job_wrapper.get_destination_configuration(
            "drmaa_external_runjob_script", None)

        include_metadata = asbool(
            job_wrapper.job_destination.params.get("embed_metadata_in_job",
                                                   True))
        if not self.prepare_job(job_wrapper,
                                include_metadata=include_metadata):
            return

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if external_runjob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join(
            map(
                lambda x: x if x in
                (string.ascii_letters + string.digits + '_') else '_', job_name))
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_name=job_name)

        # set up the drmaa job template
        jt = dict(remoteCommand=ajs.job_file,
                  jobName=ajs.job_name,
                  workingDirectory=job_wrapper.working_directory,
                  outputPath=":%s" % ajs.output_file,
                  errorPath=":%s" % ajs.error_file)

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is not None:
            jt['nativeSpecification'] = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ajs.exit_code_file)
        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception("(%s) failure writing job script" % galaxy_id_tag)
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug("(%s) Job deleted by user before it entered the queue" %
                      galaxy_id_tag)
            if job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug("(%s) submitting file %s", galaxy_id_tag, ajs.job_file)
        if native_spec:
            log.debug("(%s) native specification is: %s", galaxy_id_tag,
                      native_spec)

        # runJob will raise if there's a submit problem
        if external_runjob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            fail_msg = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.run_job(**jt)
                    break
                except (drmaa.InternalException,
                        drmaa.DeniedByDrmException) as e:
                    trynum += 1
                    log.warning(
                        '(%s) drmaa.Session.runJob() failed, will retry: %s',
                        galaxy_id_tag, e)
                    fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                    time.sleep(5)
                except Exception:
                    log.exception(
                        '(%s) drmaa.Session.runJob() failed unconditionally',
                        galaxy_id_tag)
                    trynum = 5
            else:
                log.error("(%s) All attempts to submit job failed" %
                          galaxy_id_tag)
                if not fail_msg:
                    fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
                job_wrapper.fail(fail_msg)
                return
        else:
            job_wrapper.change_ownership_for_run()
            # if user credentials are not available, use galaxy credentials (if permitted)
            allow_guests = asbool(
                job_wrapper.job_destination.params.get("allow_guests", False))
            pwent = job_wrapper.user_system_pwent
            if pwent is None:
                if not allow_guests:
                    fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." % job_wrapper.user
                    job_wrapper.fail(fail_msg)
                    return
                pwent = job_wrapper.galaxy_system_pwent
            log.debug('(%s) submitting with credentials: %s [uid: %s]' %
                      (galaxy_id_tag, pwent[0], pwent[2]))
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid = pwent[2]
            external_job_id = self.external_runjob(external_runjob_script,
                                                   filename, pwent[2]).strip()
        log.info("(%s) queued as %s" % (galaxy_id_tag, external_job_id))

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination(job_destination, external_job_id)

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(ajs)

    def _complete_terminal_job(self, ajs, drmaa_state, **kwargs):
        """
        Handle a job upon its termination in the DRM. This method is meant to
        be overridden by subclasses to improve post-mortem and reporting of
        failures.
        """
        if drmaa_state == drmaa.JobState.FAILED:
            if ajs.job_wrapper.get_state() != model.Job.states.DELETED:
                ajs.stop_job = False
                ajs.fail_message = "The cluster DRM system terminated this job"
                self.work_queue.put((self.fail_job, ajs))
        elif drmaa_state == drmaa.JobState.DONE:
            # External metadata processing for external runjobs
            external_metadata = not asbool(
                ajs.job_wrapper.job_destination.params.get(
                    "embed_metadata_in_job", True))
            if external_metadata:
                self._handle_metadata_externally(ajs.job_wrapper,
                                                 resolve_requirements=True)
            super(DRMAAJobRunner, self)._complete_terminal_job(ajs)

    def check_watched_items(self):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for ajs in self.watched:
            external_job_id = ajs.job_id
            galaxy_id_tag = ajs.job_wrapper.get_id_tag()
            old_state = ajs.old_state
            try:
                assert external_job_id not in (
                    None, 'None'), '(%s/%s) Invalid job id' % (galaxy_id_tag,
                                                               external_job_id)
                state = self.ds.job_status(external_job_id)
            except (drmaa.InternalException, drmaa.InvalidJobException) as e:
                if isinstance(e, drmaa.InvalidJobException):
                    ecn = "InvalidJobException".lower()
                else:
                    ecn = "InternalException".lower()
                retry_param = ecn.lower() + '_retries'
                state_param = ecn.lower() + '_state'
                retries = getattr(ajs, retry_param, 0)
                if self.runner_params[retry_param] > 0:
                    if retries < self.runner_params[retry_param]:
                        # will retry check on next iteration
                        setattr(ajs, retry_param, retries + 1)
                        continue
                if self.runner_params[state_param] == model.Job.states.OK:
                    log.info(
                        "(%s/%s) job left DRM queue with following message: %s",
                        galaxy_id_tag, external_job_id, e)
                    self.work_queue.put((self.finish_job, ajs))
                elif self.runner_params[state_param] == model.Job.states.ERROR:
                    log.info(
                        "(%s/%s) job check resulted in %s after %s tries: %s",
                        galaxy_id_tag, external_job_id, ecn, retries, e)
                    self.work_queue.put((self.fail_job, ajs))
                else:
                    raise Exception(
                        "%s is set to an invalid value (%s), this should not be possible. See galaxy.jobs.drmaa.__init__()"
                        % (state_param, self.runner_params[state_param]))
                continue
            except drmaa.DrmCommunicationException as e:
                log.warning("(%s/%s) unable to communicate with DRM: %s",
                            galaxy_id_tag, external_job_id, e)
                new_watched.append(ajs)
                continue
            except Exception as e:
                # so we don't kill the monitor thread
                log.exception("(%s/%s) Unable to check job status: %s" %
                              (galaxy_id_tag, external_job_id, str(e)))
                log.warning("(%s/%s) job will now be errored" %
                            (galaxy_id_tag, external_job_id))
                ajs.fail_message = "Cluster could not complete job"
                self.work_queue.put((self.fail_job, ajs))
                continue
            if state != old_state:
                log.debug("(%s/%s) state change: %s" %
                          (galaxy_id_tag, external_job_id,
                           self.drmaa_job_state_strings[state]))
            if state == drmaa.JobState.RUNNING and not ajs.running:
                ajs.running = True
                ajs.job_wrapper.change_state(model.Job.states.RUNNING)
            if state in (drmaa.JobState.FAILED, drmaa.JobState.DONE):
                self._complete_terminal_job(ajs, drmaa_state=state)
                continue
            if ajs.check_limits():
                self.work_queue.put((self.fail_job, ajs))
                continue
            ajs.old_state = state
            new_watched.append(ajs)
        # Replace the watch list with the updated version
        self.watched = new_watched

    def stop_job(self, job):
        """Attempts to delete a job from the DRM queue"""
        try:
            ext_id = job.get_job_runner_external_id()
            assert ext_id not in (None, 'None'), 'External job id is None'
            kill_script = job.get_destination_configuration(
                self.app.config, "drmaa_external_killjob_script", None)
            if kill_script is None:
                self.ds.kill(ext_id)
            else:
                # FIXME: hardcoded path
                subprocess.Popen([
                    '/usr/bin/sudo', '-E', kill_script,
                    str(ext_id),
                    str(self.userid)
                ],
                                 shell=False)
            log.debug("(%s/%s) Removed from DRM queue at user's request" %
                      (job.get_id(), ext_id))
        except drmaa.InvalidJobException:
            log.debug(
                "(%s/%s) User killed running job, but it was already dead" %
                (job.get_id(), ext_id))
        except Exception as e:
            log.debug(
                "(%s/%s) User killed running job, but error encountered removing from DRM queue: %s"
                % (job.get_id(), ext_id, e))

    def recover(self, job, job_wrapper):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        job_id = job.get_job_runner_external_id()
        if job_id is None:
            self.put(job_wrapper)
            return
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper)
        ajs.job_id = str(job_id)
        ajs.command_line = job.get_command_line()
        ajs.job_wrapper = job_wrapper
        ajs.job_destination = job_wrapper.job_destination
        if job.state == model.Job.states.RUNNING:
            log.debug(
                "(%s/%s) is still in running state, adding to the DRM queue" %
                (job.get_id(), job.get_job_runner_external_id()))
            ajs.old_state = drmaa.JobState.RUNNING
            ajs.running = True
            self.monitor_queue.put(ajs)
        elif job.get_state() == model.Job.states.QUEUED:
            log.debug(
                "(%s/%s) is still in DRM queued state, adding to the DRM queue"
                % (job.get_id(), job.get_job_runner_external_id()))
            ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
            ajs.running = False
            self.monitor_queue.put(ajs)

    def store_jobtemplate(self, job_wrapper, jt):
        """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string.
        Path is hard-coded, but it's no worse than other paths in this module.
        Uses Galaxy's JobID, so file is expected to be unique."""
        filename = "%s/%s.jt_json" % (self.app.config.cluster_files_directory,
                                      job_wrapper.get_id_tag())
        with open(filename, 'w+') as fp:
            json.dump(jt, fp)
        log.debug('(%s) Job script for external submission is: %s' %
                  (job_wrapper.job_id, filename))
        return filename

    def external_runjob(self, external_runjob_script, jobtemplate_filename,
                        username):
        """ runs an external script the will QSUB a new job.
        The external script will be run with sudo, and will setuid() to the specified user.
        Effectively, will QSUB as a different user (then the one used by Galaxy).
        """
        script_parts = external_runjob_script.split()
        script = script_parts[0]
        command = ['/usr/bin/sudo', '-E', script]
        for script_argument in script_parts[1:]:
            command.append(script_argument)

        command.extend([str(username), jobtemplate_filename])
        log.info("Running command %s" % command)
        p = subprocess.Popen(command,
                             shell=False,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = p.communicate()
        exitcode = p.returncode
        # os.unlink(jobtemplate_filename)
        if exitcode != 0:
            # There was an error in the child process
            raise RuntimeError(
                "External_runjob failed (exit code %s)\nChild process reported error:\n%s"
                % (str(exitcode), stderrdata))
        if not stdoutdata.strip():
            raise RuntimeError("External_runjob did return the job id: %s" %
                               (stdoutdata))

        # The expected output is a single line containing a single numeric value:
        # the DRMAA job-ID. If not the case, will throw an error.
        jobId = stdoutdata
        return jobId
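The submission loop in queue_job above leans on Python's while/else: the else branch runs only when the loop finishes without hitting break, i.e. when every attempt failed. A minimal standalone sketch of that control flow, with a stand-in submit function and exception in place of self.ds.run_job and the drmaa exceptions:

# Sketch of the while/else retry control flow used in queue_job.
import time


class TransientError(Exception):
    """Stand-in for drmaa.InternalException / drmaa.DeniedByDrmException."""


def submit():
    raise TransientError("cluster busy")  # always fails in this sketch


external_job_id = None
trynum = 0
while external_job_id is None and trynum < 5:
    try:
        external_job_id = submit()
        break                      # a successful submit skips the else: clause
    except TransientError as e:
        trynum += 1
        print("submit failed, will retry: %s" % e)
        time.sleep(0.01)           # shortened sleep for the sketch
else:
    # reached only when the loop exhausted all attempts without break
    print("All attempts to submit job failed")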
Example #9
class DRMAAJobRunner(AsynchronousJobRunner):
    """
    Job runner backed by a finite pool of worker threads. FIFO scheduling
    """
    runner_name = "DRMAARunner"
    restrict_job_name_length = 15

    def __init__(self, app, nworkers, **kwargs):
        """Start the job runner"""
        global drmaa

        runner_param_specs = {
            'drmaa_library_path':
            dict(map=str, default=os.environ.get('DRMAA_LIBRARY_PATH', None))
        }
        for retry_exception in RETRY_EXCEPTIONS_LOWER:
            runner_param_specs[retry_exception + '_state'] = dict(
                map=str,
                valid=lambda x: x in
                (model.Job.states.OK, model.Job.states.ERROR),
                default=model.Job.states.OK)
            runner_param_specs[retry_exception + '_retries'] = dict(
                map=int, valid=lambda x: int(x) >= 0, default=0)

        if 'runner_param_specs' not in kwargs:
            kwargs['runner_param_specs'] = dict()
        kwargs['runner_param_specs'].update(runner_param_specs)

        super().__init__(app, nworkers, **kwargs)

        # This allows multiple drmaa runners (although only one per handler) in the same job config file
        if 'drmaa_library_path' in kwargs:
            log.info(
                'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s',
                self.runner_params.drmaa_library_path)
            os.environ[
                'DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path

        # Import is delayed until runner initialization to allow for the
        # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH
        try:
            drmaa = __import__("drmaa")
        except (ImportError, RuntimeError) as exc:
            raise exc.__class__(
                'The Python drmaa package is required to use this '
                'feature, please install it or correct the '
                'following error:\n%s: %s' %
                (exc.__class__.__name__, str(exc)))
        from pulsar.managers.util.drmaa import DrmaaSessionFactory

        # make the drmaa library also available to subclasses
        self.drmaa = drmaa

        # Subclasses may need access to state constants
        self.drmaa_job_states = drmaa.JobState

        # Descriptive state strings pulled from the drmaa lib itself
        self.drmaa_job_state_strings = {
            drmaa.JobState.UNDETERMINED: 'process status cannot be determined',
            drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active',
            drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold',
            drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold',
            drmaa.JobState.USER_SYSTEM_ON_HOLD:
            'job is queued and in user and system hold',
            drmaa.JobState.RUNNING: 'job is running',
            drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended',
            drmaa.JobState.USER_SUSPENDED: 'job is user suspended',
            drmaa.JobState.DONE: 'job finished normally',
            drmaa.JobState.FAILED: 'job finished, but failed',
        }

        # Ensure a DRMAA session exists and is initialized
        self.ds = DrmaaSessionFactory().get()

        self.userid = None

        self._init_monitor_thread()
        self._init_worker_threads()
        self.redact_email_in_job_name = self.app.config.redact_email_in_job_name

    def url_to_destination(self, url):
        """Convert a legacy URL to a job destination"""
        if not url:
            return
        native_spec = url.split('/')[2]
        if native_spec:
            params = dict(nativeSpecification=native_spec)
            log.debug(
                f"Converted URL '{url}' to destination runner=drmaa, params={params}"
            )
            return JobDestination(runner='drmaa', params=params)
        else:
            log.debug("Converted URL '%s' to destination runner=drmaa" % url)
            return JobDestination(runner='drmaa')

    def get_native_spec(self, url):
        """Get any native DRM arguments specified by the site configuration"""
        try:
            return url.split('/')[2] or None
        except Exception:
            return None

    def queue_job(self, job_wrapper):
        """Create job script and submit it to the DRM"""
        # prepare the job

        # external_runJob_script can be None, in which case it's not used.
        external_runjob_script = job_wrapper.get_destination_configuration(
            "drmaa_external_runjob_script", None)

        include_metadata = asbool(
            job_wrapper.job_destination.params.get("embed_metadata_in_job",
                                                   True))
        if not self.prepare_job(job_wrapper,
                                include_metadata=include_metadata):
            return

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        job_name = self._job_name(job_wrapper)
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_name=job_name)

        # set up the drmaa job template
        jt = dict(remoteCommand=ajs.job_file,
                  jobName=ajs.job_name,
                  workingDirectory=job_wrapper.working_directory,
                  outputPath=":%s" % ajs.output_file,
                  errorPath=":%s" % ajs.error_file)

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is None:
            native_spec = job_destination.params.get('native_specification',
                                                     None)
        if native_spec is not None:
            jt['nativeSpecification'] = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ajs.exit_code_file,
                                   shell=job_wrapper.shell)
        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception("(%s) failure writing job script" % galaxy_id_tag)
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() in (model.Job.states.DELETED,
                                       model.Job.states.STOPPED):
            log.debug(
                "(%s) Job deleted/stopped by user before it entered the queue",
                galaxy_id_tag)
            if job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug("(%s) submitting file %s", galaxy_id_tag, ajs.job_file)
        if native_spec:
            log.debug("(%s) native specification is: %s", galaxy_id_tag,
                      native_spec)

        # runJob will raise if there's a submit problem
        if external_runjob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            fail_msg = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.run_job(**jt)
                    break
                except (drmaa.InternalException,
                        drmaa.DeniedByDrmException) as e:
                    trynum += 1
                    log.warning(
                        '(%s) drmaa.Session.runJob() failed, will retry: %s',
                        galaxy_id_tag, e)
                    fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                    time.sleep(5)
                except Exception:
                    log.exception(
                        '(%s) drmaa.Session.runJob() failed unconditionally',
                        galaxy_id_tag)
                    trynum = 5
            else:
                log.error("(%s) All attempts to submit job failed" %
                          galaxy_id_tag)
                if not fail_msg:
                    fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
                job_wrapper.fail(fail_msg)
                return
        else:
            job_wrapper.change_ownership_for_run()
            # if user credentials are not available, use galaxy credentials (if permitted)
            allow_guests = asbool(
                job_wrapper.job_destination.params.get("allow_guests", False))
            pwent = job_wrapper.user_system_pwent
            if pwent is None:
                if not allow_guests:
                    fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." % job_wrapper.user
                    job_wrapper.fail(fail_msg)
                    return
                pwent = job_wrapper.galaxy_system_pwent
            log.debug('({}) submitting with credentials: {} [uid: {}]'.format(
                galaxy_id_tag, pwent[0], pwent[2]))
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid = pwent[2]
            external_job_id = self.external_runjob(external_runjob_script,
                                                   filename, pwent[2])
            if external_job_id is None:
                job_wrapper.fail("(%s) could not queue job" % galaxy_id_tag)
                return
        log.info(f"({galaxy_id_tag}) queued as {external_job_id}")

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(external_job_id)

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(ajs)
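        # A minimal sketch (hypothetical values) of other destination settings this
        # runner reacts to (see also _complete_terminal_job and stop_job below);
        # only the names come from this runner, the values are illustrative:
        #
        #   allow_guests = 'True'                   # permit fallback to Galaxy's system user
        #   embed_metadata_in_job = 'False'         # resolve metadata externally on finish
        #   drmaa_external_runjob_script = '/usr/local/bin/drmaa_runjob.sh'
        #   drmaa_external_killjob_script = '/usr/local/bin/drmaa_killjob.sh'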

    def _complete_terminal_job(self, ajs, drmaa_state, **kwargs):
        """
        Handle a job upon its termination in the DRM. This method is meant to
        be overridden by subclasses to improve post-mortem and reporting of
        failures.

        Returns True if the job was not actually terminal, None otherwise.
        (This base implementation always returns None; a subclass that can
        detect a non-terminal job is expected to return True.)
        """
        job_state = ajs.job_wrapper.get_state()
        if drmaa_state == drmaa.JobState.FAILED and job_state != model.Job.states.STOPPED:
            if job_state != model.Job.states.DELETED:
                ajs.stop_job = False
                ajs.fail_message = "The cluster DRM system terminated this job"
                self.work_queue.put((self.fail_job, ajs))
        elif drmaa_state == drmaa.JobState.DONE or job_state == model.Job.states.STOPPED:
            # External metadata processing for external runjobs
            external_metadata = not asbool(
                ajs.job_wrapper.job_destination.params.get(
                    "embed_metadata_in_job", True))
            if external_metadata:
                self._handle_metadata_externally(ajs.job_wrapper,
                                                 resolve_requirements=True)
            if job_state != model.Job.states.DELETED:
                self.work_queue.put((self.finish_job, ajs))

    def check_watched_item(self, ajs, new_watched):
        """
        Look at a single watched job, determine its state, and deal with errors
        that can happen in the process. To be called from check_watched_items().

        Returns the state, or None if an exception occurred. In the latter case
        the job is appended to new_watched if a

        1. drmaa.InternalException,
        2. drmaa.InvalidJobException, or
        3. drmaa.DrmCommunicationException

        occurred (which causes the job to be checked again in the next
        iteration of check_watched_items). Furthermore:

        - the job is finished as errored if any other exception occurs
        - the job is finished OK or errored after the maximum number of retries,
          depending on the exception

        Note that None is returned in all cases where the loop in
        check_watched_items is to be continued.
        """
        external_job_id = ajs.job_id
        galaxy_id_tag = ajs.job_wrapper.get_id_tag()
        state = None
        try:
            assert external_job_id not in (
                None,
                'None'), f'({galaxy_id_tag}/{external_job_id}) Invalid job id'
            state = self.ds.job_status(external_job_id)
            # Reset exception retries
            for retry_exception in RETRY_EXCEPTIONS_LOWER:
                setattr(ajs, retry_exception + '_retries', 0)
        except (drmaa.InternalException, drmaa.InvalidJobException) as e:
            ecn = type(e).__name__
            retry_param = ecn.lower() + '_retries'
            state_param = ecn.lower() + '_state'
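            # e.g. for drmaa.InternalException these become
            # 'internalexception_retries' / 'internalexception_state'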
            retries = getattr(ajs, retry_param, 0)
            log.warning(
                "(%s/%s) unable to check job status because of %s exception for %d consecutive tries: %s",
                galaxy_id_tag, external_job_id, ecn, retries + 1, e)
            if self.runner_params[retry_param] > 0:
                if retries < self.runner_params[retry_param]:
                    # will retry check on next iteration
                    setattr(ajs, retry_param, retries + 1)
                    new_watched.append(ajs)
                    return None
            if self.runner_params[state_param] == model.Job.states.OK:
                log.warning("(%s/%s) job will now be finished OK",
                            galaxy_id_tag, external_job_id)
                self.work_queue.put((self.finish_job, ajs))
            elif self.runner_params[state_param] == model.Job.states.ERROR:
                log.warning("(%s/%s) job will now be errored", galaxy_id_tag,
                            external_job_id)
                self.work_queue.put((self.fail_job, ajs))
            else:
                raise Exception(
                    "%s is set to an invalid value (%s), this should not be possible. See galaxy.jobs.drmaa.__init__()"
                    % (state_param, self.runner_params[state_param]))
            return None
        except drmaa.DrmCommunicationException as e:
            log.warning("(%s/%s) unable to communicate with DRM: %s",
                        galaxy_id_tag, external_job_id, e)
            new_watched.append(ajs)
            return None
        except Exception:
            # so we don't kill the monitor thread
            log.exception(
                f"({galaxy_id_tag}/{external_job_id}) unable to check job status"
            )
            log.warning(
                f"({galaxy_id_tag}/{external_job_id}) job will now be errored")
            ajs.fail_message = "Cluster could not complete job"
            self.work_queue.put((self.fail_job, ajs))
            return None
        return state

    def check_watched_items(self):
        """
        Called by the monitor thread to look at each watched job and deal
        with state changes.
        """
        new_watched = []
        for ajs in self.watched:
            external_job_id = ajs.job_id
            galaxy_id_tag = ajs.job_wrapper.get_id_tag()
            old_state = ajs.old_state
            state = self.check_watched_item(ajs, new_watched)
            if state is None:
                continue
            if state != old_state:
                log.debug("({}/{}) state change: {}".format(
                    galaxy_id_tag, external_job_id,
                    self.drmaa_job_state_strings[state]))
            if state == drmaa.JobState.RUNNING and not ajs.running:
                ajs.running = True
                ajs.job_wrapper.change_state(model.Job.states.RUNNING)
            if state in (drmaa.JobState.FAILED, drmaa.JobState.DONE):
                if self._complete_terminal_job(ajs,
                                               drmaa_state=state) is not None:
                    # job was not actually terminal
                    state = ajs.old_state
                else:
                    continue
            if ajs.running:
                # TODO: stop checking at some point
                ajs.job_wrapper.check_for_entry_points()
            if ajs.check_limits():
                self.work_queue.put((self.fail_job, ajs))
                continue
            ajs.old_state = state
            new_watched.append(ajs)
        # Replace the watch list with the updated version
        self.watched = new_watched

    def stop_job(self, job_wrapper):
        """Attempts to delete a job from the DRM queue"""
        job = job_wrapper.get_job()
        try:
            ext_id = job.get_job_runner_external_id()
            assert ext_id not in (None, 'None'), 'External job id is None'
            kill_script = job_wrapper.get_destination_configuration(
                "drmaa_external_killjob_script")
            if kill_script is None:
                self.ds.kill(ext_id)
            else:
                cmd = shlex.split(kill_script)
                cmd.extend([str(ext_id), str(self.userid)])
                commands.execute(cmd)
            log.info(
                f"({job.id}/{ext_id}) Removed from DRM queue at user's request"
            )
        except drmaa.InvalidJobException:
            log.exception(
                f"({job.id}/{ext_id}) User killed running job, but it was already dead"
            )
        except commands.CommandLineException as e:
            log.error(
                "({}/{}) User killed running job, but command execution failed: {}"
                .format(job.id, ext_id, unicodify(e)))
        except Exception:
            log.exception(
                f"({job.id}/{ext_id}) User killed running job, but error encountered removing from DRM queue"
            )

    def recover(self, job, job_wrapper):
        """Recovers jobs stuck in the queued/running state when Galaxy started"""
        job_id = job.get_job_runner_external_id()
        if job_id is None:
            self.put(job_wrapper)
            return
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper)
        ajs.job_id = str(job_id)
        ajs.command_line = job.get_command_line()
        ajs.job_wrapper = job_wrapper
        ajs.job_destination = job_wrapper.job_destination
        if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED):
            log.debug(
                f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue"
            )
            ajs.old_state = drmaa.JobState.RUNNING
            ajs.running = True
            self.monitor_queue.put(ajs)
        elif job.get_state() == model.Job.states.QUEUED:
            log.debug(
                f"({job.id}/{job.get_job_runner_external_id()}) is still in DRM queued state, adding to the DRM queue"
            )
            ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
            ajs.running = False
            self.monitor_queue.put(ajs)

    def store_jobtemplate(self, job_wrapper, jt):
        """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string.
        Path is hard-coded, but it's no worse than other path in this module.
        Uses Galaxy's JobID, so file is expected to be unique."""
        filename = f"{self.app.config.cluster_files_directory}/{job_wrapper.get_id_tag()}.jt_json"
        with open(filename, 'w+') as fp:
            json.dump(jt, fp)
        log.debug(
            f'({job_wrapper.job_id}) Job script for external submission is: {filename}'
        )
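        # A minimal sketch (hypothetical contents) of the file written above,
        # e.g. <cluster_files_directory>/42.jt_json:
        #   {"nativeSpecification": "-q short.q", ...}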
        return filename

    def external_runjob(self, external_runjob_script, jobtemplate_filename,
                        username):
        """ runs an external script that will QSUB a new job.
        The external script needs to be run with sudo, and will setuid() to the specified user.
        Effectively, will QSUB as a different user (than the one used by Galaxy).
        """
        cmd = shlex.split(external_runjob_script)
        cmd.extend([str(username), jobtemplate_filename])
        log.info("Running command %s" % cmd)
        try:
            stdoutdata = commands.execute(cmd).strip()
        except commands.CommandLineException:
            log.exception("External_runjob failed")
            return None
        # The expected output is a single line containing a single numeric value:
        # the DRMAA job id. If that is not the case, log an error and give up.
        if not stdoutdata:
            log.error("External_runjob returned nothing instead of the job id")
            return None
        return stdoutdata

    def _job_name(self, job_wrapper):
        external_runjob_script = job_wrapper.get_destination_configuration(
            "drmaa_external_runjob_script", None)
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if not self.redact_email_in_job_name and external_runjob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join(x if x in (string.ascii_letters + string.digits +
                                      '_') else '_' for x in job_name)
        if self.restrict_job_name_length:
            job_name = job_name[:self.restrict_job_name_length]
        return job_name
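        # For example (hypothetical inputs, assuming email is not redacted and no
        # external runjob script is set): galaxy_id_tag '123', tool old_id 'bwa'
        # and user 'jane@example.org' yield 'g123_bwa_jane_example_org', since
        # every character outside [A-Za-z0-9_] is replaced with '_'.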