Esempio n. 1
0
 def recover(self, job, job_wrapper):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     job_id = job.get_job_runner_external_id()
     pbs_job_state = AsynchronousJobState()
     pbs_job_state.output_file = "%s/%s.o" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.error_file = "%s/%s.e" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.exit_code_file = "%s/%s.ec" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_file = "%s/%s.sh" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_id = str(job_id)
     pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
     pbs_job_state.job_destination = job_wrapper.job_destination
     job_wrapper.command_line = job.command_line
     pbs_job_state.job_wrapper = job_wrapper
     if job.state == model.Job.states.RUNNING:
         log.debug(
             "(%s/%s) is still in running state, adding to the PBS queue" %
             (job.id, job.get_job_runner_external_id()))
         pbs_job_state.old_state = 'R'
         pbs_job_state.running = True
         self.monitor_queue.put(pbs_job_state)
     elif job.state == model.Job.states.QUEUED:
         log.debug(
             "(%s/%s) is still in PBS queued state, adding to the PBS queue"
             % (job.id, job.get_job_runner_external_id()))
         pbs_job_state.old_state = 'Q'
         pbs_job_state.running = False
         self.monitor_queue.put(pbs_job_state)
Esempio n. 2
0
 def recover(self, job, job_wrapper):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     job_id = job.get_job_runner_external_id()
     pbs_job_state = AsynchronousJobState()
     pbs_job_state.output_file = "%s/%s.o" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.error_file = "%s/%s.e" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.exit_code_file = "%s/%s.ec" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_id = str(job_id)
     pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
     pbs_job_state.job_destination = job_wrapper.job_destination
     job_wrapper.command_line = job.command_line
     pbs_job_state.job_wrapper = job_wrapper
     if job.state == model.Job.states.RUNNING:
         log.debug(
             "(%s/%s) is still in running state, adding to the PBS queue"
             % (job.id, job.get_job_runner_external_id())
         )
         pbs_job_state.old_state = "R"
         pbs_job_state.running = True
         self.monitor_queue.put(pbs_job_state)
     elif job.state == model.Job.states.QUEUED:
         log.debug(
             "(%s/%s) is still in PBS queued state, adding to the PBS queue"
             % (job.id, job.get_job_runner_external_id())
         )
         pbs_job_state.old_state = "Q"
         pbs_job_state.running = False
         self.monitor_queue.put(pbs_job_state)
Esempio n. 3
0
    def queue_job(self, job_wrapper):
        """Create PBS script for a job and submit it to the PBS queue"""
        # prepare the job
        if not self.prepare_job(
                job_wrapper,
                include_metadata=not (self.app.config.pbs_stage_path)):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        pbs_options = []
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop(
                '-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params[
                    'destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or
                                                           '', pbs_server_name)

        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail(
                "Unable to queue job for execution.  Resubmitting the job may succeed."
            )
            log.error("Connection to PBS server for submit failed: %s: %s" %
                      (errno, text))
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory,
                               job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            output_files = [str(o) for o in output_fnames]
            output_files.append(ecfile)
            stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() +
                                            output_files,
                                            symlink=True)
            stageout = self.get_stage_in_out(output_files)
            attrs = [
                dict(name=pbs.ATTR_o, value=pbs_ofile),
                dict(name=pbs.ATTR_e, value=pbs_efile),
                dict(name=pbs.ATTR_stagein, value=stagein),
                dict(name=pbs.ATTR_stageout, value=stageout),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict(name=pbs.ATTR_o, value=ofile),
                dict(name=pbs.ATTR_e, value=efile),
            ]

        # define PBS job options
        attrs.append(
            dict(name=pbs.ATTR_N,
                 value=str("%s_%s_%s" %
                           (job_wrapper.job_id, job_wrapper.tool.id,
                            job_wrapper.user))))
        job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
        for i, attr in enumerate(attrs + pbs_options):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath(job_wrapper.working_directory)

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with file(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join(job_wrapper.get_input_fnames() + output_files),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [stage_commands]
        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ecfile,
                                   env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory,
                                 job_wrapper.job_id)
        self.write_executable_script(job_file, script)
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug(
                "Job %s deleted by user before it entered the PBS queue" %
                job_wrapper.job_id)
            pbs.pbs_disconnect(c)
            if self.app.config.cleanup_job in ("always", "onsuccess"):
                self.cleanup((ofile, efile, ecfile, job_file))
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name,
                                    None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" %
                        (galaxy_job_id, tries, errno, text))
            time.sleep(2)
        else:
            log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
            job_wrapper.fail(
                "Unable to run this job due to a cluster error, please retry it later"
            )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" %
                      (galaxy_job_id, job_id))
        else:
            log.debug("(%s) queued in %s queue as %s" %
                      (galaxy_job_id, pbs_queue_name, job_id))

        # persist destination
        job_wrapper.set_job_destination(job_destination, job_id)

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(job_state)
Esempio n. 4
0
    def queue_job( self, job_wrapper ):
        """Create PBS script for a job and submit it to the PBS queue"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        pbs_options = []
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop('-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

        c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail( "Unable to queue job for execution.  Resubmitting the job may succeed." )
            log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            output_files = [ str( o ) for o in output_fnames ]
            output_files.append(ecfile)
            stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
            stageout = self.get_stage_in_out( output_files )
            attrs = [
                dict( name=pbs.ATTR_o, value=pbs_ofile ),
                dict( name=pbs.ATTR_e, value=pbs_efile ),
                dict( name=pbs.ATTR_stagein, value=stagein ),
                dict( name=pbs.ATTR_stageout, value=stageout ),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict( name=pbs.ATTR_o, value=ofile ),
                dict( name=pbs.ATTR_e, value=efile ),
            ]

        # define PBS job options
        attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
        job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
        for i, attr in enumerate( attrs + pbs_options ):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath( job_wrapper.working_directory )

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join( job_wrapper.get_input_fnames() + output_files ),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [ stage_commands ]
        script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        self.write_executable_script( job_file, script )
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
            pbs.pbs_disconnect(c)
            if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
                self.cleanup( ( ofile, efile, ecfile, job_file ) )
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
            time.sleep(2)
        else:
            log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
            job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

        # persist destination
        job_wrapper.set_job_destination( job_destination, job_id )

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( job_state )