コード例 #1
0
 def set_hold(self, hold_type=None):
     """Place a hold of the requested type on this job.

     When no type is given, ``pbs.USER_HOLD`` is used. The default is resolved
     at call time rather than in the signature so that this module can still
     be loaded when the ``pbs`` module is not available.

     A hold type already listed in ``self.holds`` is not set again; a warning
     is logged instead. A hold that was set without error is appended to
     ``self.holds``.
     """
     wanted = pbs.USER_HOLD if hold_type is None else hold_type

     # guard: a hold that is already tracked must not be set a second time
     if wanted in self.holds:
         self.log.warning("Hold type %s was already set for %s" %
                          (wanted, self.jobid))
         return

     if wanted not in KNOWN_HOLD_TYPES:
         self.log.error(
             "set_hold: unknown hold type: %s (supported: %s)" %
             (wanted, KNOWN_HOLD_TYPES))

     # ask the server to hold the job, then look at both the exit code and
     # the error state reported by the pbs module
     exit_code = pbs.pbs_holdjob(self.pbsconn, self.jobid, wanted, NULL)
     failed, message = pbs.error()
     if not (failed or exit_code):
         # success: remember the hold so it is not set twice
         self.holds.append(wanted)
         return

     details = (wanted, self.jobid, failed, exit_code, message)
     self.log.error(
         "Failed to set hold of type %s on job %s (is_error: %s, exit code: %s, msg: %s)"
         % details)
コード例 #2
0
ファイル: set_property.py プロジェクト: FRidh/pbs_python
def main():
  """Set the 'note' attribute of the node named on the command line.

  Exits with status 1 when no default PBS server is known, when the hostname
  argument is missing, or when the connection to the server cannot be
  opened. A failing ``pbs_manager`` call is reported on stdout: the return
  code first, then the PBS error number and text.
  """
  pbs_server = pbs.pbs_default()
  if not pbs_server:
    print('No default server')
    sys.exit(1)

  if len(sys.argv) < 2:
    print("Usage: set_property.py <hostname>")
    sys.exit(1)

  hostname = sys.argv[1]

  con = pbs.pbs_connect(pbs_server)
  if con < 0:
    # a negative handle means the connection could not be established
    errno, text = pbs.error()
    print('%s %s' % (errno, text))
    sys.exit(1)

  try:
    # single attribute operation: note = 'set_something_useful'
    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'note'
    attrop_l[0].value = 'set_something_useful'
    attrop_l[0].op = pbs.SET

    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                        hostname, attrop_l, 'NULL')

    if r > 0:
      # single-argument print calls give the same output on Python 2 and 3
      print('%s ;' % r)
      errno, text = pbs.error()
      print('%s %s' % (errno, text))
  finally:
    # always give the connection back to the server
    pbs.pbs_disconnect(con)
コード例 #3
0
 def release_hold(self, hold_type=None):
     """Release a hold of the requested type from this job.

     When no type is given, ``pbs.USER_HOLD`` is used. The default is resolved
     at call time rather than in the signature so that this module can still
     be loaded when the ``pbs`` module is not available.

     A hold type that is not listed in ``self.holds`` is skipped with a
     warning. On success the type is removed from ``self.holds``.

     Raises:
         EasyBuildError: if the hold type is unknown, or if releasing the
             hold fails (non-zero exit code or an error reported by pbs).
     """
     if hold_type is None:
         hold_type = pbs.USER_HOLD

     # only release hold if it was set
     if hold_type not in self.holds:
         self.log.warning(
             "No hold type %s was set for %s, so skipping hold release" %
             (hold_type, self.jobid))
         return

     if hold_type not in KNOWN_HOLD_TYPES:
         raise EasyBuildError(
             "release_hold: unknown hold type: %s (supported: %s)",
             hold_type, KNOWN_HOLD_TYPES)

     # release hold and check for errors
     ec = pbs.pbs_rlsjob(self.pbsconn, self.jobid, hold_type, NULL)
     is_error, errormsg = pbs.error()
     if is_error or ec:
         raise EasyBuildError(
             "Failed to release hold type %s on job %s (is_error: %s, exit code: %s, msg: %s)",
             hold_type, self.jobid, is_error, ec, errormsg)

     # the release went through: stop tracking the hold, and only now report
     # it as released (previously this was logged before the error check)
     self.holds.remove(hold_type)
     self.log.debug("Released hold of type %s for job %s" %
                    (hold_type, self.jobid))
コード例 #4
0
ファイル: torque.py プロジェクト: liek51/civet
def _connect_to_server(server=None):
    """
        Open a connection to the pbs_server running on host ``server``; when
        ``server`` is empty or None the default server is used instead.

        A failed attempt is retried up to ``_MAX_RETRY`` times, sleeping
        ``attempt ** 2`` seconds before each retry. When every attempt fails
        an Exception carrying the Torque error is raised.

        This function is shared between JobManager and TorqueJobRunner
    """
    target = server or pbs.pbs_default()

    handle = pbs.pbs_connect(target)

    attempts = 0
    while handle <= 0 and attempts < _MAX_RETRY:
        attempts += 1
        # quadratic back-off before trying again
        time.sleep(attempts ** 2)
        handle = pbs.pbs_connect(target)

    if handle > 0:
        return handle

    # the batch system returned an error, throw exception
    errno, _unused_text = pbs.error()
    raise Exception("Error connecting to pbs_server.  "
                    "Torque error {0}: '{1}'".format(
                        errno, torque_strerror(errno)))
コード例 #5
0
ファイル: torque.py プロジェクト: liek51/civet
    def submit_with_retry(pbs_attrs, script_path, queue, pbs_server=None):
        """Submit a job script, retrying a failed submission.

        A connection is opened with ``_connect_to_server(pbs_server)``. A
        failed ``pbs_submit`` is retried up to ``_MAX_RETRY`` times, sleeping
        ``retry ** 2`` seconds before each retry. The connection is always
        closed again, also when the submission raises.

        Returns:
            The job id returned by ``pbs.pbs_submit``.

        Raises:
            Exception: when no attempt produced a job id; the message carries
                the Torque error number and its description.
        """
        # connect to pbs server
        connection = _connect_to_server(pbs_server)

        errno = None
        try:
            # submit job
            retry = 0
            job_id = pbs.pbs_submit(connection, pbs_attrs, script_path, queue,
                                    None)

            # if pbs.pbs_submit failed, try again
            while not job_id and retry < _MAX_RETRY:
                retry += 1
                print("Retrying connection...", file=sys.stderr)
                time.sleep(retry**2)
                job_id = pbs.pbs_submit(connection, pbs_attrs, script_path,
                                        queue, None)

            if not job_id:
                # read the error state before disconnecting, so that the
                # disconnect call cannot overwrite it
                errno, _unused_text = pbs.error()
        finally:
            pbs.pbs_disconnect(connection)

        # check to see if the job was submitted successfully.
        if not job_id:
            # the batch system returned an error, throw exception
            raise Exception("Error submitting job.  "
                            "Torque error {0}: '{1}'".format(
                                errno, torque_strerror(errno)))

        return job_id
コード例 #6
0
ファイル: sara_nodes.py プロジェクト: FRidh/pbs_python
    def _process(self, batch_list):
        '''Apply the changes in batch_list to the batch server.

        Each entry of batch_list is indexed as node[0] (passed to pbs_manager
        as the object name) and node[1] (passed as the attribute list).

        In dry-run mode (ARGS_DRYRUN) no connection is made; the pbs_manager
        call that would be issued is printed instead. Otherwise a single
        connection is opened, used for every node and always closed again.
        A failing pbs_manager call is reported on stderr and processing
        continues with the next node.

        Exits with status 1 when no default pbs server can be located.
        '''

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process input:%s' % str(batch_list), file=sys.stderr)

        ## Always get the pbs_server name, even in dry-run mode
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            _print('Could not locate a pbs server', file=sys.stderr)
            sys.exit(1)

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server, file=sys.stderr)

        ## Dry-run: only show what would be executed
        if ARGS_DRYRUN:
            for node in batch_list:
                _print("pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')" % (node[0], str(node[1])))
            return

        ## One connection for the whole batch; the previous code opened an
        ## additional connection for every node and closed only the last one
        pbs_connection = pbs.pbs_connect(pbs_server)
        try:
            ## Execute the changes
            for node in batch_list:
                rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node[0], node[1], 'NULL')
                if rcode > 0:
                    errno, text = pbs.error()
                    _print('PBS error for node \'%s\': %s (%s)' % (node[0], text, errno), file=sys.stderr)
        finally:
            ## Close the connection with the batch system
            pbs.pbs_disconnect(pbs_connection)
コード例 #7
0
def main():
    """Set the 'note' attribute of the node named on the command line.

    Exits with status 1 when no default PBS server is known, when the
    hostname argument is missing, or when the connection to the server cannot
    be opened. A failing ``pbs_manager`` call is reported on stdout: the
    return code first, then the PBS error number and text.
    """
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print('No default server')
        sys.exit(1)

    if len(sys.argv) < 2:
        print("Usage: set_property.py <hostname>")
        sys.exit(1)

    hostname = sys.argv[1]

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        # a negative handle means the connection could not be established
        errno, text = pbs.error()
        print('%s %s' % (errno, text))
        sys.exit(1)

    try:
        # single attribute operation: note = 'set_something_useful'
        attrop_l = pbs.new_attropl(1)
        attrop_l[0].name = 'note'
        attrop_l[0].value = 'set_something_useful'
        attrop_l[0].op = pbs.SET

        r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, hostname,
                            attrop_l, 'NULL')

        if r > 0:
            # single-argument print calls give the same output on
            # Python 2 and Python 3
            print('%s ;' % r)
            errno, text = pbs.error()
            print('%s %s' % (errno, text))
    finally:
        # always give the connection back to the server
        pbs.pbs_disconnect(con)
コード例 #8
0
ファイル: SchedulerPbs.py プロジェクト: dmwm/ProdCommon
 def pbs_conn(self):
     """Connect to the default PBS server and return the connection handle.

     A negative handle is treated as a failure: the PBS error code and text
     are logged and a SchedulerError is raised.
     """
     handle = pbs.pbs_connect(pbs.pbs_default())
     if handle >= 0:
         return handle

     code, message = pbs.error()
     self.logging.error('Error in PBS server conncet')
     # the same "<code>: <text>" detail goes to the log and the exception
     detail = str(code) + ': ' + message
     self.logging.error('PBS error code ' + detail)
     raise SchedulerError('PBS error', detail)
コード例 #9
0
ファイル: SchedulerPbs.py プロジェクト: PerilousApricot/CRAB2
 def pbs_conn(self):
     """Open a connection to the default PBS server.

     Returns the connection handle. When the handle is negative, the PBS
     error code and text are logged and a SchedulerError is raised instead.
     """
     server_name = pbs.pbs_default()
     connection = pbs.pbs_connect(server_name)
     connected = not (connection < 0)
     if not connected:
         pbs_code, pbs_text = pbs.error()
         self.logging.error('Error in PBS server conncet')
         # shared "<code>: <text>" part of the log line and the exception
         summary = str(pbs_code) + ': ' + pbs_text
         self.logging.error('PBS error code ' + summary)
         raise SchedulerError('PBS error', summary)
     return connection
コード例 #10
0
ファイル: pbs_runner.py プロジェクト: licode/pyLight-Li
 def run(self):
     """Write ``self.script`` to a file and submit it to the 'batch' queue.

     The job id returned by ``pbs.pbs_submit`` is stored in ``self.jobid``.
     When the pbs module reports an error, a warning is logged and
     ``self.status`` is set to ``main.job.JOB_STATUS.FAIL``; the success
     message is only logged when no error was reported.

     The script file is removed and ``self.disconnect()`` is called even when
     the submission raises.
     """
     script_name = 'pylight_script'
     with open(script_name, 'w') as f:
         f.write(self.script)

     try:
         connection = self.connect()
         try:
             self.jobid = pbs.pbs_submit(connection, self.attropl,
                                         script_name, 'batch', "NULL")
             # read the error state right after the call it belongs to
             e, text = pbs.error()
         finally:
             self.disconnect()
     finally:
         # the script has been handed to the server (or the submission
         # failed); either way it must not be left behind
         os.remove(script_name)

     if e:
         log.warning("Failed to submit a job: %s", text)
         self.status = main.job.JOB_STATUS.FAIL
     else:
         log.info("PBS submits a job %s" % self.jobid)
コード例 #11
0
    def run_cluster(self, pbs_server, job_script, settings):
        """Submit job_script to a PBS server and poll until the job is gone.

        A deep copy of ``settings`` is stored in ``self.settings`` and the
        submitted job id in ``self.job_id``. The job is submitted with four
        attributes: name ("inferno_" + self.name), rerunnable "y", walltime
        "400" and nodes "1:ppn=4".

        ``pbs_server`` is the server name to connect to; None selects the
        default server. An integer is used as an already open connection
        handle and is left open on return.
        NOTE(review): the integer case exists only to stay compatible with
        callers that may have passed a handle — confirm against callers.

        The job is polled every 5 seconds; polling stops once
        ``pbs_statjob`` no longer returns any status for it.
        NOTE(review): assumes an empty status means the job has left the
        queue — confirm for the server in use.
        """
        import pbs

        self.settings = copy.deepcopy(settings)
        # Launch script, wait for output to come back, return when it does

        # Create the job options struct
        attropl = pbs.new_attropl(4)

        # Set the name of the job
        attropl[0].name = pbs.ATTR_N
        attropl[0].value = "inferno_" + self.name

        # Job is Rerunable
        attropl[1].name = pbs.ATTR_r
        attropl[1].value = "y"

        # Walltime
        attropl[2].name = pbs.ATTR_l
        attropl[2].resource = "walltime"
        attropl[2].value = "400"

        # Nodes
        attropl[3].name = pbs.ATTR_l
        attropl[3].resource = "nodes"
        attropl[3].value = "1:ppn=4"

        if pbs_server is None:
            pbs_server = pbs.pbs_default()

        # pbs_submit/pbs_statjob need a connection handle, not a server name
        owns_connection = not isinstance(pbs_server, int)
        if owns_connection:
            connection = pbs.pbs_connect(pbs_server)
            if connection < 0:
                e, e_txt = pbs.error()
                print('%s %s' % (e, e_txt))
                return
        else:
            connection = pbs_server

        try:
            # Run the job
            job_id = pbs.pbs_submit(connection, attropl, job_script, "NULL", "NULL")

            e, e_txt = pbs.error()
            if e:
                print('%s %s' % (e, e_txt))

            # Save the job ID for later so we can check on the status
            self.job_id = job_id
            if not job_id:
                # nothing was queued, so there is nothing to poll
                return

            # Poll the server about the job id every 5 seconds until it no
            # longer reports the job.
            while True:
                job_info = pbs.pbs_statjob(connection, self.job_id, "NULL", "NULL")
                print(job_info)
                if not job_info:
                    break
                time.sleep(5)
        finally:
            # only close a connection that was opened here
            if owns_connection:
                pbs.pbs_disconnect(connection)
コード例 #12
0
    def submitScript(script):
        """Submit the job described by ``script`` to the default PBS server.

        ``script`` is read by key: 'jobName', 'maxTime', 'cpuNumber',
        'scriptName' and 'queue'. Empty values fall back to "new_job",
        '01:00:00' and a nodes specification of '1' respectively.

        Returns:
            A dict with 'Result' ('OK' or 'ERROR') and 'Message' (the job id
            on success, otherwise the error description). Exceptions raised
            while submitting are reported the same way and never propagate.
        """
        result = {}
        try:
            pbs_connection = pbs.pbs_connect(pbs.pbs_default())
            if pbs_connection < 0:
                # no usable connection: report the PBS error right away
                e, e_txt = pbs.error()
                result['Result'] = 'ERROR'
                result['Message'] = str(e) + ' : ' + e_txt
                return result

            try:
                attropl = pbs.new_attropl(4)

                # Set the name of the job
                attropl[0].name = pbs.ATTR_N
                attropl[0].value = str(script['jobName']) if script['jobName'] else "new_job"

                # Job is Rerunable
                attropl[1].name = pbs.ATTR_r
                attropl[1].value = 'y'

                # Walltime
                attropl[2].name = pbs.ATTR_l
                attropl[2].resource = 'walltime'
                attropl[2].value = str(script['maxTime']) if script['maxTime'] else '01:00:00'

                # Nodes; the parentheses only spell out how the conditional
                # expression already binds
                attropl[3].name = pbs.ATTR_l
                attropl[3].resource = 'nodes'
                attropl[3].value = ('1:ppn=' + str(script['cpuNumber'])) if script['cpuNumber'] else '1'

                job_id = pbs.pbs_submit(pbs_connection, attropl, str(script['scriptName']), str(script['queue']), 'NULL')

                e, e_txt = pbs.error()
            finally:
                # the connection used to stay open after every call
                pbs.pbs_disconnect(pbs_connection)

            if e:
                result['Result'] = 'ERROR'
                result['Message'] = str(e) + ' : ' + e_txt
            else:
                result['Result'] = 'OK'
                result['Message'] = job_id
        except Exception as exc:
            result['Result'] = 'ERROR'
            result['Message'] = str(exc)

        return result
コード例 #13
0
ファイル: SchedulerPbs.py プロジェクト: PerilousApricot/CRAB2
    def kill(self, obj):
        """Delete every valid job of ``obj`` from the PBS server.

        Jobs whose ``runningJob`` is rejected by ``self.valid`` are skipped.
        The connection obtained from ``self.pbs_conn()`` is always closed
        again, also when deleting a job raises.

        Raises:
            SchedulerError: as soon as ``pbs_deljob`` returns a non-zero
                result for a job; the remaining jobs are not processed.
        """
        conn = self.pbs_conn()

        try:
            for job in obj.jobs:
                if not self.valid(job.runningJob):
                    continue
                # renamed from `id` so the builtin is not shadowed
                job_id = str(job.runningJob['schedulerId']).strip()
                res = pbs.pbs_deljob(conn, job_id, '')

                if res != 0:
                    err, err_text = pbs.error()
                    self.logging.error('Error in job kill for ' + job_id)
                    self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
                    raise SchedulerError('PBS error', str(err) + ': ' + err_text)
        finally:
            # single exit point for the connection, on success and on error
            self.pbs_disconn(conn)
コード例 #14
0
ファイル: SchedulerPbs.py プロジェクト: dmwm/ProdCommon
    def kill(self, obj):
        """Delete all valid jobs of the task ``obj`` from the PBS server.

        A job is skipped when ``self.valid(job.runningJob)`` is false. The
        scheduler id of each remaining job is passed to ``pbs.pbs_deljob``.
        The PBS connection is released in every case, including when an
        unexpected exception interrupts the loop.

        Raises:
            SchedulerError: on the first job whose deletion returns a
                non-zero result; later jobs are left untouched.
        """
        conn = self.pbs_conn()

        try:
            for job in obj.jobs:
                running_job = job.runningJob
                if not self.valid(running_job):
                    continue

                # `scheduler_id` instead of `id`: do not shadow the builtin
                scheduler_id = str(running_job['schedulerId']).strip()
                res = pbs.pbs_deljob(conn, scheduler_id, '')
                if res == 0:
                    continue

                err, err_text = pbs.error()
                self.logging.error('Error in job kill for ' + scheduler_id)
                self.logging.error('PBS error code ' + str(err) + ': ' +
                                   err_text)
                raise SchedulerError('PBS error', str(err) + ': ' + err_text)
        finally:
            # one place closes the connection for both outcomes
            self.pbs_disconn(conn)
コード例 #15
0
ファイル: pbs_runner.py プロジェクト: licode/pyLight-Li
 def submit_jobs_pbs(self, jobs):
     """Submit each job in ``jobs`` to the 'batch' queue.

     For every job a script is rendered from ``PBS_SCRIPT`` with the tool
     directory and the job command, written to a file, and submitted over the
     connection ``self.c``. The script file is removed again even when the
     submission raises.

     A job for which the pbs module reports an error is logged and skipped.
     A successfully submitted job is stored in ``self.submit_list`` under its
     PBS job id.
     """
     for job in jobs:
         tool = job.tool
         command = job.create_command()
         attropl = self.get_pbs_attr(job.db_job.id, tool.config)
         script = PBS_SCRIPT % (tool.directory, command)
         log.info(script)

         script_name = 'pylight_script'
         with open(script_name, 'w') as f:
             f.write(script)
         try:
             job_id = pbs.pbs_submit(self.c, attropl, script_name, 'batch', "NULL")
             # read the error state right after the call it belongs to
             e, text = pbs.error()
         finally:
             # never leave the script file behind
             os.remove(script_name)

         if e:
             log.warning("Failed to submit a job: %s", text)
             # what about jobs that follow this one?
             continue
         log.info("PBS submits a job %s as %s" % (job, job_id))
         self.submit_list[job_id] = job
コード例 #16
0
ファイル: SchedulerPbs.py プロジェクト: dmwm/ProdCommon
    def query(self, obj, service='', objType='node'):
        """
        query status and eventually other scheduler related information
        It may use single 'node' scheduler id or bulk id for association

        For every valid job of ``obj`` the attributes 'job_state' and
        'exec_host' are requested from the PBS server and stored in
        ``job.runningJob`` as 'statusScheduler', 'status' (translated through
        ``self.status_map``) and 'destination'. A job the server does not
        know (PBS error 15001) is reported as 'Done' with an empty
        destination.

        ``service`` and ``objType`` are accepted but not used here.

        Raises:
            SchedulerError: if ``obj`` is not a Task, or if the status query
                fails with any error other than 15001.
        """
        # isinstance also accepts subclasses of Task
        if not isinstance(obj, Task):
            raise SchedulerError('wrong argument type', str(type(obj)))

        conn = self.pbs_conn()
        try:
            attrl = pbs.new_attrl(2)
            attrl[0].name = 'job_state'
            attrl[1].name = 'exec_host'

            for job in obj.jobs:
                if not self.valid(job.runningJob):
                    continue
                # renamed from `id` so the builtin is not shadowed
                job_id = str(job.runningJob['schedulerId']).strip()
                jobstat = pbs.pbs_statjob(conn, job_id, attrl, 'Null')

                host = ''
                if jobstat:
                    pbs_stat = jobstat[0].attribs[0].value
                    if len(jobstat[0].attribs) > 1:
                        host = jobstat[0].attribs[1].value
                else:
                    # empty (or missing) status: only "unknown job" is
                    # acceptable; a missing result no longer hits len()
                    err, err_text = pbs.error()
                    if err != 15001:  # unknown job (probably finished)
                        self.logging.error('Error in job query for ' + job_id)
                        self.logging.error('PBS error code ' + str(err) +
                                           ': ' + err_text)
                        raise SchedulerError('PBS error',
                                             str(err) + ': ' + err_text)
                    pbs_stat = 'Done'

                job.runningJob['statusScheduler'] = pbs_stat
                job.runningJob['status'] = self.status_map[pbs_stat]
                job.runningJob['destination'] = host
        finally:
            # release the connection on success, on error and on any
            # unexpected exception
            self.pbs_disconn(conn)
コード例 #17
0
ファイル: torque.py プロジェクト: ddurkin/civet
def _connect_to_server(server):
    """
        Open a connection to the pbs_server running on host ``server``; an
        empty or None ``server`` selects the default server.

        Returns the connection handle, or raises an Exception carrying the
        Torque error when the handle is not positive.

        This function is shared between JobManager and TorqueJobRunner
    """
    # an explicit host wins, otherwise ask pbs for the default server
    target = server if server else pbs.pbs_default()
    handle = pbs.pbs_connect(target)

    if handle > 0:
        return handle

    # the batch system returned an error, throw exception
    errno, _unused_text = pbs.error()
    raise Exception("Error connecting to pbs_server.  "
                    "Torque error {0}: '{1}'".format(
                        errno, torque_strerror(errno)))
コード例 #18
0
ファイル: set_property.py プロジェクト: hocks/TSCC
def main():
  """Add 'set_something_useful' to the 'properties' attribute of node "e2".

  Exits with status 1 when no default PBS server is known or when the
  connection to it cannot be opened. A failing ``pbs_manager`` call is
  reported on stdout: the return code first, then the PBS error number and
  text.
  """
  pbs_server = pbs.pbs_default()
  if not pbs_server:
    print('No default server')
    sys.exit(1)

  con = pbs.pbs_connect(pbs_server)
  if con < 0:
    # a negative handle means the connection could not be established
    errno, text = pbs.error()
    print('%s %s' % (errno, text))
    sys.exit(1)

  try:
    # single attribute operation: properties += 'set_something_useful'
    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'properties'
    attrop_l[0].value = 'set_something_useful'
    attrop_l[0].op = pbs.INCR

    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                        "e2", attrop_l, 'NULL')

    if r > 0:
      # single-argument print calls give the same output on Python 2 and 3
      print('%s ;' % r)
      errno, text = pbs.error()
      print('%s %s' % (errno, text))
  finally:
    # always give the connection back to the server
    pbs.pbs_disconnect(con)
コード例 #19
0
 def set_hold(self, hold_type=None):
     """Set a hold of the given type on this job.

     ``hold_type`` falls back to ``pbs.USER_HOLD``. The fallback is applied
     here instead of in the signature because this module must stay loadable
     when the ``pbs`` module is not available.

     Hold types already present in ``self.holds`` only trigger a warning;
     a hold that was set without error is added to ``self.holds``.
     """
     if hold_type is None:
         hold_type = pbs.USER_HOLD

     already_held = hold_type in self.holds
     if already_held:
         self.log.warning("Hold type %s was already set for %s" % (hold_type, self.jobid))
     else:
         if hold_type not in KNOWN_HOLD_TYPES:
             self.log.error("set_hold: unknown hold type: %s (supported: %s)" % (hold_type, KNOWN_HOLD_TYPES))

         # request the hold; both the return code and the pbs error state
         # decide whether it counts as set
         rc = pbs.pbs_holdjob(self.pbsconn, self.jobid, hold_type, NULL)
         err_flag, err_text = pbs.error()
         if err_flag or rc:
             self.log.error("Failed to set hold of type %s on job %s (is_error: %s, exit code: %s, msg: %s)"
                            % (hold_type, self.jobid, err_flag, rc, err_text))
         else:
             # keep track of this hold
             self.holds.append(hold_type)
コード例 #20
0
    def _process(self, batch_list):
        '''Apply the changes in batch_list to the batch server.

        Each entry of batch_list is indexed as node[0] (passed to pbs_manager
        as the object name) and node[1] (passed as the attribute list).

        In dry-run mode (ARGS_DRYRUN) no connection is made; the pbs_manager
        call that would be issued is printed instead. Otherwise a single
        connection is opened, used for every node and always closed again.
        A failing pbs_manager call is reported on stderr and processing
        continues with the next node.

        Exits with status 1 when no default pbs server can be located.
        '''

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process input:%s' % str(batch_list),
                   file=sys.stderr)

        ## Always get the pbs_server name, even in dry-run mode
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            _print('Could not locate a pbs server', file=sys.stderr)
            sys.exit(1)

        if ARGS_VERBOSE:
            _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server,
                   file=sys.stderr)

        ## Dry-run: only show what would be executed
        if ARGS_DRYRUN:
            for node in batch_list:
                _print(
                    "pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')"
                    % (node[0], str(node[1])))
            return

        ## One connection for the whole batch; the previous code opened an
        ## additional connection for every node and closed only the last one
        pbs_connection = pbs.pbs_connect(pbs_server)
        try:
            ## Execute the changes
            for node in batch_list:
                rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET,
                                        pbs.MGR_OBJ_NODE, node[0], node[1],
                                        'NULL')
                if rcode > 0:
                    errno, text = pbs.error()
                    _print('PBS error for node \'%s\': %s (%s)' %
                           (node[0], text, errno),
                           file=sys.stderr)
        finally:
            ## Close the connection with the batch system
            pbs.pbs_disconnect(pbs_connection)
コード例 #21
0
 def release_hold(self, hold_type=None):
     """Release a hold of the given type from this job.

     ``hold_type`` falls back to ``pbs.USER_HOLD``. The fallback is applied
     here instead of in the signature because this module must stay loadable
     when the ``pbs`` module is not available.

     A hold type that is not listed in ``self.holds`` is skipped with a
     warning. When the release succeeds the type is removed from
     ``self.holds``; a failure is reported through ``self.log.error``.
     """
     if hold_type is None:
         hold_type = pbs.USER_HOLD

     # only release hold if it was set
     if hold_type not in self.holds:
         self.log.warning("No hold type %s was set for %s, so skipping hold release" % (hold_type, self.jobid))
         return

     if hold_type not in KNOWN_HOLD_TYPES:
         self.log.error("release_hold: unknown hold type: %s (supported: %s)" % (hold_type, KNOWN_HOLD_TYPES))

     # release hold and check for errors
     ec = pbs.pbs_rlsjob(self.pbsconn, self.jobid, hold_type, NULL)
     is_error, errormsg = pbs.error()
     if is_error or ec:
         tup = (hold_type, self.jobid, is_error, ec, errormsg)
         self.log.error("Failed to release hold type %s on job %s (is_error: %s, exit code: %s, msg: %s)" % tup)
         return

     # the release went through: stop tracking the hold, and only now report
     # it as released (previously this was logged before the error check)
     self.holds.remove(hold_type)
     self.log.debug("Released hold of type %s for job %s" % (hold_type, self.jobid))
コード例 #22
0
ファイル: SchedulerPbs.py プロジェクト: PerilousApricot/CRAB2
    def query(self, obj, service='', objType='node') :
        """
        query status and eventually other scheduler related information
        It may use single 'node' scheduler id or bulk id for association

        For every valid job of ``obj`` the attributes 'job_state' and
        'exec_host' are requested from the PBS server and stored in
        ``job.runningJob`` as 'statusScheduler', 'status' (translated through
        ``self.status_map``) and 'destination'. A job the server does not
        know (PBS error 15001) is reported as 'Done' with an empty
        destination.

        ``service`` and ``objType`` are accepted but not used here.

        Raises:
            SchedulerError: if ``obj`` is not a Task, or if the status query
                fails with any error other than 15001.
        """
        # isinstance also accepts subclasses of Task
        if not isinstance(obj, Task):
            raise SchedulerError('wrong argument type', str(type(obj)))

        conn = self.pbs_conn()
        try:
            attrl = pbs.new_attrl(2)
            attrl[0].name = 'job_state'
            attrl[1].name = 'exec_host'

            for job in obj.jobs:
                if not self.valid(job.runningJob):
                    continue
                # renamed from `id` so the builtin is not shadowed
                job_id = str(job.runningJob['schedulerId']).strip()
                jobstat = pbs.pbs_statjob(conn, job_id, attrl, 'Null')

                host = ''
                if jobstat:
                    pbs_stat = jobstat[0].attribs[0].value
                    if len(jobstat[0].attribs) > 1:
                        host = jobstat[0].attribs[1].value
                else:
                    # empty (or missing) status: only "unknown job" is
                    # acceptable; a missing result no longer hits len()
                    err, err_text = pbs.error()
                    if err != 15001:  # unknown job (probably finished)
                        self.logging.error('Error in job query for ' + job_id)
                        self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
                        raise SchedulerError('PBS error', str(err) + ': ' + err_text)
                    pbs_stat = 'Done'

                job.runningJob['statusScheduler'] = pbs_stat
                job.runningJob['status'] = self.status_map[pbs_stat]
                job.runningJob['destination'] = host
        finally:
            # release the connection on success, on error and on any
            # unexpected exception
            self.pbs_disconn(conn)
コード例 #23
0
    def __init__(self, settings):
        """Set up the job bookkeeping and, if requested, the PBS connection.

        ``settings["global"]["use_cluster"]`` selects the mode. In cluster
        mode a connection to ``settings["global"]["cluster_address"]`` is
        opened and the thread limit is lifted to ``sys.maxint``; the process
        exits with status 1 when the connection fails. Otherwise the thread
        limit is taken from ``settings["global"]["n_processors"]``.
        """
        # bookkeeping for jobs in their various states
        self.queue = []
        self.finished = []
        self.running = []
        self.error = []
        self.job_ids = []
        self.threads = []

        # defaults, overridden below depending on the mode
        self.use_cluster = False
        self.connection = None
        self.curr_id = 0
        self.settings = settings

        global_settings = settings["global"]
        if global_settings["use_cluster"] == True:
            import pbs

            self.use_cluster = True

            # Establish connection to PBS server
            server_address = settings["global"]["cluster_address"]

            # Let the cluster's jobman handle scheduling
            self.max_threads = sys.maxint
            self.connection = pbs.pbs_connect(server_address)
            if self.connection < 0:
                errno, text = pbs.error()
                print "Error, unable to establish connection to PBS server."
                print errno, text
                sys.exit(1)
        else:
            self.use_cluster = False
            self.max_threads = settings["global"]["n_processors"]
コード例 #24
0
ファイル: pbs.py プロジェクト: AAFC-MBB/galaxy-1
    def queue_job( self, job_wrapper ):
        """Create PBS script for a job and submit it to the PBS queue.

        Steps, in order:
          1. prepare the job (return silently when preparation fails);
          2. derive the PBS queue/server from the job destination params
             ('-q' is accepted as an alias for 'destination', which may be
             'queue', 'queue@server' or '@server');
          3. connect to the PBS server (fail the job when that is not
             possible);
          4. build the job attributes, with stage-in/stage-out when an
             application server is configured;
          5. write the job script;
          6. submit, trying up to 5 times with a 2 second pause in between;
          7. on success, record the job state on the monitor queue.

        The PBS connection is closed on every path that leaves this method
        after it was opened.
        """
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop('-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

        c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail( "Unable to queue job for execution.  Resubmitting the job may succeed." )
            log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # Built unconditionally: the stage commands further down need this
        # list whenever pbs_stage_path is set, also when no application
        # server is configured (it used to be undefined in that case).
        output_files = [ str( o ) for o in output_fnames ]
        output_files.append(ecfile)

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
            stageout = self.get_stage_in_out( output_files )
            attrs = [
                dict( name=pbs.ATTR_o, value=pbs_ofile ),
                dict( name=pbs.ATTR_e, value=pbs_efile ),
                dict( name=pbs.ATTR_stagein, value=stagein ),
                dict( name=pbs.ATTR_stageout, value=stageout ),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict( name=pbs.ATTR_o, value=ofile ),
                dict( name=pbs.ATTR_e, value=efile ),
            ]

        # define PBS job options
        attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
        job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
        for i, attr in enumerate( attrs + pbs_options ):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath( job_wrapper.working_directory )

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join( job_wrapper.get_input_fnames() + output_files ),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [ stage_commands ]
        script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        self.write_executable_script( job_file, script )
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
            pbs.pbs_disconnect(c)
            if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
                self.cleanup( ( ofile, efile, ecfile, job_file ) )
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
            time.sleep(2)
        else:
            # every attempt failed: release the connection before giving up
            # (it used to stay open on this path)
            pbs.pbs_disconnect(c)
            log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
            job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

        # persist destination
        job_wrapper.set_job_destination( job_destination, job_id )

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( job_state )
コード例 #25
0
        def pbs_batch( self, nodes, attrs=None, note_attributes=None ):
                nodeserror = list()
                if not attrs and not note_attributes:
                        raise sara_nodesException, 'attrs and note_attributes can not be empty together!'

                if not self.dryrun:
                        if note_attributes and len( note_attributes ) == 3:
                                if attrs:
                                        attributes = attrs + pbs.new_attropl(1)
                                        attributes[1].name = pbs.ATTR_NODE_note
                                        attributes[1].op = pbs.SET
                                else:
                                        attributes = pbs.new_attropl(1)
                                        attributes[0].name = pbs.ATTR_NODE_note
                                        attributes[0].op = pbs.SET
                        else:
                                attributes = attrs
                        # Some hacking here because some limitation in the Torque 2.4 version
                        # fetching note data first for all nodes!
                        tmp_node_note = dict()

                        for node in nodes:
                                if note_attributes and len( note_attributes ) == 3:
	                                    tmp_node_note[ node ] = self.note( node, note_attributes )

                        pbs_server = pbs.pbs_default()

                        if not pbs_server:
                                raise sara_nodesException, 'Default pbs server not found!'

                        pbs_connection = pbs.pbs_connect( pbs_server )
                        for node in nodes:
                                if note_attributes and len( note_attributes ) == 3:
                                        try:
                                                if attrs:
                                                        attributes[1].value = tmp_node_note[ node ]
                                                else:
                                                        attributes[0].value = tmp_node_note[ node ]
                                        except KeyError:
                                                pass
                                rcode = pbs.pbs_manager( pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node, attributes, 'NULL' )
                                if rcode > 0:
                                        errno, text = pbs.error()
                                        nodeserror.append( '%s: %s (%s)' % ( node, text, errno ) )
                else:
                        p = PBSQuery.PBSQuery()
                        pbsnodes = p.getnodes().keys()

                        print '%*s:' % ( 7, 'Nodes' ),
			firstitem = True

                        for node in nodes:
                                if node in pbsnodes:
					if firstitem:
	                                        print '%s' % node
						firstitem = False
					else:
						print '%*s' % ( 17, node )
                                else:
                                        nodeserror.append( '%s: does not exist' % node )

                if len( nodeserror ) > 0:
                        raise sara_nodesException, nodeserror
コード例 #26
0
ファイル: torque.py プロジェクト: liek51/civet
    def queue_job(self, batch_job):
        """
          Queue a BatchJob and record its id in the id log.

          When ``self.submit`` is true the job is submitted through the pbs
          bindings (retrying a failed submit up to ``_MAX_RETRY`` times);
          otherwise a fake id of the form ``<seq>.civet`` is generated.

          :param batch_job: description of the job to queue
          :return: the job id (real or fake)
          :raises Exception: when every submit attempt fails
        """

        # Civet generates a unique name per step and uses it to name log
        # files, so a duplicate here indicates a programming error.
        assert batch_job.name not in self._job_names

        log_dir = self.log_dir
        if self.execution_log_dir:
            log_dir = self.execution_log_dir

        # fill in default stdout/stderr locations when the job has none
        if not batch_job.stdout_path:
            batch_job.stdout_path = os.path.join(log_dir, batch_job.name + ".o")
        if not batch_job.stderr_path:
            batch_job.stderr_path = os.path.join(log_dir, batch_job.name + ".e")

        # write batch script
        filename = self.write_script(batch_job)

        if not self.submit:
            # not really submitting: fake a job ID
            job_id = "{0}.civet".format(self._id_seq)
            self._id_seq += 1
        else:
            # resources requested through Resource_List entries
            resources = {
                'nodes': "{0}:ppn={1}".format(batch_job.nodes, batch_job.ppn),
                'walltime': batch_job.walltime,
                'epilogue': self.epilogue_filename
            }
            if batch_job.mem:
                resources['mem'] = batch_job.mem

            # plain job attributes
            attributes = {pbs.ATTR_v: self.generate_env(batch_job.workdir)}

            if batch_job.name:
                attributes[pbs.ATTR_N] = batch_job.name

            #XXX workaround for a TORQUE bug where local copies of stderr &
            # stdout files to /dev/null don't work correctly but remote
            # copies (to submit host) do
            for attr_name, path in ((pbs.ATTR_o, batch_job.stdout_path),
                                    (pbs.ATTR_e, batch_job.stderr_path)):
                if path == "/dev/null":
                    path = socket.gethostname() + ":/dev/null"
                attributes[attr_name] = path

            # a user hold is only requested for jobs without dependencies
            if batch_job.depends_on:
                attributes[pbs.ATTR_depend] = self._dependency_string(batch_job)
            elif self.submit_with_hold:
                attributes[pbs.ATTR_h] = 'u'

            if batch_job.mail_option:
                attributes[pbs.ATTR_m] = batch_job.mail_option

            if batch_job.email_list:
                attributes[pbs.ATTR_M] = batch_job.email_list

            if batch_job.date_time:
                epoch_seconds = int(time.mktime(batch_job.date_time.timetuple()))
                attributes[pbs.ATTR_a] = str(epoch_seconds)

            # one attropl slot per resource followed by one per attribute
            attropl = pbs.new_attropl(len(attributes) + len(resources))

            for slot, (res_name, res_value) in enumerate(resources.iteritems()):
                attropl[slot].name = pbs.ATTR_l
                attropl[slot].resource = res_name
                attropl[slot].value = res_value

            first_attr_slot = len(resources)
            for slot, (attr_name, attr_value) in enumerate(
                    attributes.iteritems(), first_attr_slot):
                attropl[slot].name = attr_name
                attropl[slot].value = attr_value

            # everything is prepared: connect to the server and submit
            connection = _connect_to_server(self._server)

            job_id = pbs.pbs_submit(connection, attropl, filename,
                                    self.queue, None)

            # if pbs.pbs_submit failed, try again with a growing delay
            attempt = 1
            while not job_id and attempt <= _MAX_RETRY:
                print("Retrying connection...", file=sys.stderr)
                time.sleep(attempt**2)
                job_id = pbs.pbs_submit(connection, attropl, filename,
                                        self.queue, None)
                attempt += 1

            pbs.pbs_disconnect(connection)

            if not job_id:
                # the batch system returned an error, throw exception
                e, e_msg = pbs.error()
                raise Exception("Error submitting job.  "
                                "Torque error {0}: '{1}'".format(
                                    e, torque_strerror(e)))

            if self.submit_with_hold and not batch_job.depends_on:
                self.held_jobs.append(job_id)

        self._job_names.append(batch_job.name)

        # id log line: job id, job name and printable dependencies
        log_fields = (job_id, batch_job.name,
                      str(self._printable_dependencies(batch_job.depends_on)))
        self._id_log.write('\t'.join(log_fields) + '\n')
        self._id_log.flush()
        return job_id
コード例 #27
0
    def submit(self, with_hold=False):
        """Write the job script to a temporary file and submit it to PBS.

        On success ``self.jobid`` is set and the temporary script is removed;
        on failure an error is logged and the script file is left in place.

        :param with_hold: when true, submit the job with a user hold and
                          record that hold in ``self.holds``
        """
        script_text = self.script
        self.log.debug("Going to submit script %s" % script_text)

        def single_attribute(attr_name, attr_value):
            """Build a one-element attropl holding the given name/value."""
            entry = pbs.new_attropl(1)
            entry[0].name = attr_name
            entry[0].value = attr_value
            return entry

        # the attribute list starts with the job name (Job_Name)
        pbs_attributes = single_attribute(pbs.ATTR_N, self.name)

        # one Resource_List entry per requested resource
        resource_list = pbs.new_attropl(len(self.resources))
        for pos, (res_name, res_value) in enumerate(self.resources.items()):
            resource_list[pos].name = pbs.ATTR_l
            resource_list[pos].resource = res_name
            resource_list[pos].value = res_value
        pbs_attributes.extend(resource_list)

        # job dependencies, all of the 'afterany' kind
        if self.deps:
            dependencies = single_attribute(
                pbs.ATTR_depend,
                ",".join("afterany:%s" % dep for dep in self.deps))
            pbs_attributes.extend(dependencies)
            self.log.debug("Job deps attributes: %s" % dependencies[0].value)

        # (user) hold, only when asked for
        if with_hold:
            hold = single_attribute(pbs.ATTR_h, pbs.USER_HOLD)
            pbs_attributes.extend(hold)
            self.holds.append(pbs.USER_HOLD)
            self.log.debug("Job hold attributes: %s" % hold[0].value)

        # environment variables that qsub would normally add;
        # WORKDIR falls back to the current directory
        os.environ.setdefault('WORKDIR', os.getcwd())

        pbsvars = []
        for var in ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']:
            pbsvars.append("PBS_O_%s=%s" % (var, os.environ.get(var, 'NOTFOUND_%s' % var)))
        # followed by the variables specified for this job
        for name, value in self.env_vars.items():
            pbsvars.append("%s=%s" % (name, value))

        variables = single_attribute(pbs.ATTR_v, ",".join(pbsvars))  # Variable_List
        pbs_attributes.extend(variables)
        self.log.debug("Job variable attributes: %s" % variables[0].value)

        # mail settings: Mail_Points 'n' disables all mail
        mail = single_attribute(pbs.ATTR_m, 'n')
        pbs_attributes.extend(mail)
        self.log.debug("Job mail attributes: %s" % mail[0].value)

        # dump the script into a temporary file for pbs_submit
        fd, scriptfn = tempfile.mkstemp()
        script_file = os.fdopen(fd, 'w')
        self.log.debug("Writing temporary job script to %s" % scriptfn)
        script_file.write(script_text)
        script_file.close()

        self.log.debug("Going to submit to queue %s" % self.queue)

        # job submission sometimes fails without producing an error, e.g. when one of the dependency jobs has already finished
        # when that occurs, None will be returned by pbs_submit as job id
        jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, NULL)
        is_error, errormsg = pbs.error()
        if not is_error and jobid is not None:
            self.log.debug("Succesful job submission returned jobid %s" % jobid)
            self.jobid = jobid
            os.remove(scriptfn)
        else:
            self.log.error("Failed to submit job script %s (job id: %s, error %s)" % (scriptfn, jobid, errormsg))
コード例 #28
0
    def _submit(self):
        """Write the job script to a temporary file and submit it with a user hold.

        Sets ``self.jobid`` and removes the temporary script on success;
        raises EasyBuildError when submission reports an error or yields no
        job id.
        """
        script_text = self.script
        self.log.debug("Going to submit script %s" % script_text)

        # fixed attributes: job name plus stdout/stderr locations
        pbs_attributes = pbs.new_attropl(3)
        output_dir = build_option('job_output_dir')
        fixed = (
            (pbs.ATTR_N, self.name),  # Job_Name
            (pbs.ATTR_o, os.path.join(output_dir, '%s.o$PBS_JOBID' % self.name)),
            (pbs.ATTR_e, os.path.join(output_dir, '%s.e$PBS_JOBID' % self.name)),
        )
        for pos, (attr_name, attr_value) in enumerate(fixed):
            pbs_attributes[pos].name = attr_name
            pbs_attributes[pos].value = attr_value

        # resource requirements, one Resource_List entry each
        resource_list = pbs.new_attropl(len(self.resources))
        for pos, (res_name, res_value) in enumerate(self.resources.items()):
            resource_list[pos].name = pbs.ATTR_l
            resource_list[pos].resource = res_name
            resource_list[pos].value = res_value
        pbs_attributes.extend(resource_list)

        # dependencies are expressed through the job ids of the jobs in self.deps
        if self.deps:
            dependencies = pbs.new_attropl(1)
            dependencies[0].name = pbs.ATTR_depend
            dependencies[0].value = ",".join(
                "afterany:%s" % dep.jobid for dep in self.deps)
            pbs_attributes.extend(dependencies)
            self.log.debug("Job deps attributes: %s" % dependencies[0].value)

        # the job is always submitted with a (user) hold
        hold = pbs.new_attropl(1)
        hold[0].name = pbs.ATTR_h
        hold[0].value = pbs.USER_HOLD
        pbs_attributes.extend(hold)
        self.holds.append(pbs.USER_HOLD)
        self.log.debug("Job hold attributes: %s" % hold[0].value)

        # environment variables that qsub would normally add;
        # WORKDIR falls back to the current directory
        os.environ.setdefault('WORKDIR', os.getcwd())

        pbsvars = []
        for var in ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']:
            pbsvars.append("PBS_O_%s=%s" % (var, os.environ.get(var, 'NOTFOUND_%s' % var)))
        # followed by the variables specified for this job
        for name, value in self.env_vars.items():
            pbsvars.append("%s=%s" % (name, value))

        variables = pbs.new_attropl(1)
        variables[0].name = pbs.ATTR_v  # Variable_List
        variables[0].value = ",".join(pbsvars)
        pbs_attributes.extend(variables)
        self.log.debug("Job variable attributes: %s" % variables[0].value)

        # mail settings: Mail_Points 'n' disables all mail
        mail = pbs.new_attropl(1)
        mail[0].name = pbs.ATTR_m
        mail[0].value = 'n'
        pbs_attributes.extend(mail)
        self.log.debug("Job mail attributes: %s" % mail[0].value)

        # dump the script into a temporary file for pbs_submit
        fd, scriptfn = tempfile.mkstemp()
        script_file = os.fdopen(fd, 'w')
        self.log.debug("Writing temporary job script to %s" % scriptfn)
        script_file.write(script_text)
        script_file.close()

        self.log.debug("Going to submit to queue %s" % self.queue)

        # job submission sometimes fails without producing an error, e.g. when one of the dependency jobs has already finished
        # when that occurs, None will be returned by pbs_submit as job id
        jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn,
                               self.queue, NULL)
        is_error, errormsg = pbs.error()
        if is_error or jobid is None:
            raise EasyBuildError(
                "Failed to submit job script %s (job id: %s, error %s)",
                scriptfn, jobid, errormsg)

        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid
        os.remove(scriptfn)
コード例 #29
0
    def queue_job(self, job_wrapper):
        """Create PBS script for a job and submit it to the PBS queue.

        The PBS server/queue and extra options are derived from the job
        destination parameters. The job script is written to the cluster
        files directory and submitted (up to five attempts). On success the
        job state is put on ``self.monitor_queue``; on connection or
        submission failure the job wrapper is failed and nothing is queued.

        :param job_wrapper: wrapper of the job to run; its destination,
                            working directory and in/output file names are used
        """
        # prepare the job
        if not self.prepare_job(
                job_wrapper,
                include_metadata=not (self.app.config.pbs_stage_path)):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop(
                '-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params[
                    'destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or
                                                           '', pbs_server_name)

        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail(
                "Unable to queue job for execution.  Resubmitting the job may succeed."
            )
            log.error("Connection to PBS server for submit failed: %s: %s" %
                      (errno, text))
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory,
                               job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # Built unconditionally: the stage-path branch further down needs this
        # list even when no application server is configured.
        output_files = [str(o) for o in output_fnames]
        output_files.append(ecfile)

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() +
                                            output_files,
                                            symlink=True)
            stageout = self.get_stage_in_out(output_files)
            attrs = [
                dict(name=pbs.ATTR_o, value=pbs_ofile),
                dict(name=pbs.ATTR_e, value=pbs_efile),
                dict(name=pbs.ATTR_stagein, value=stagein),
                dict(name=pbs.ATTR_stageout, value=stageout),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict(name=pbs.ATTR_o, value=ofile),
                dict(name=pbs.ATTR_e, value=efile),
            ]

        # define PBS job options
        attrs.append(
            dict(name=pbs.ATTR_N,
                 value=str("%s_%s_%s" %
                           (job_wrapper.job_id, job_wrapper.tool.id,
                            job_wrapper.user))))
        job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
        for i, attr in enumerate(attrs + pbs_options):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath(job_wrapper.working_directory)

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join(job_wrapper.get_input_fnames() + output_files),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [stage_commands]
        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ecfile,
                                   env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory,
                                 job_wrapper.job_id)
        self.write_executable_script(job_file, script)
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug(
                "Job %s deleted by user before it entered the PBS queue" %
                job_wrapper.job_id)
            pbs.pbs_disconnect(c)
            if self.app.config.cleanup_job in ("always", "onsuccess"):
                self.cleanup((ofile, efile, ecfile, job_file))
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name,
                                    None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" %
                        (galaxy_job_id, tries, errno, text))
            time.sleep(2)
        else:
            # every attempt failed: release the connection before giving up
            pbs.pbs_disconnect(c)
            log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
            job_wrapper.fail(
                "Unable to run this job due to a cluster error, please retry it later"
            )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" %
                      (galaxy_job_id, job_id))
        else:
            log.debug("(%s) queued in %s queue as %s" %
                      (galaxy_job_id, pbs_queue_name, job_id))

        # persist destination
        job_wrapper.set_job_destination(job_destination, job_id)

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(job_state)
コード例 #30
0
ファイル: pbsmon.py プロジェクト: jasonshih/pbs_python
def main():
    """Query the state of all nodes and display the cluster status.

    The PBS server is taken from the first command-line argument, falling
    back to the default server. Exits with status 1 when no server is known
    or the connection fails.
    """
    state_list = []
    node_list = []
    node_nr = 0

    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            print("No default pbs server, usage: pbsmon [server] ")
            sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print("%s %s" % (errno, text))
        sys.exit(1)

    # We are only interested in the state and jobs of a node
    #
    attrl = pbs.new_attrl(2)
    attrl[0].name = "state"
    attrl[1].name = "jobs"

    nodes = pbs.pbs_statnode(con, "", attrl, "NULL")

    # The first run of digits in the node name is used as its label;
    # compiled once here instead of once per node.
    re_host = re.compile(r"(?P<name>\d+)")

    for node in nodes:
        node_attr = node.attribs

        # A node can have several states. We are only interested in the
        # first entry.
        #
        state = node_attr[0].value.split(",")[0]

        # If a job is scheduled on a free node, mark it as a separate state
        #
        state_key = state
        if state == pbs.ND_free and any(x.name == "jobs" for x in node_attr):
            state_key = pbs_ND_free_and_job
        state_list.append(translate_state[state_key])

        result = re_host.search(node.name)
        if result:
            node_list.append(result.group("name"))
        else:
            # no digits in the name: fall back to a running counter
            node_nr = node_nr + 1
            node_list.append(str(node_nr))

    display_cluster_status(node_list, state_list)
コード例 #31
0
ファイル: SchedulerPbs.py プロジェクト: PerilousApricot/CRAB2
    def submitJob ( self, conn, job, task=None, requirements=''):
        """Write a wrapper script and an epilogue, then submit the job to PBS.

        The wrapper copies the input sandbox files listed in
        ``task['globalSandbox']`` into ``$PBS_JOBCOOKIE`` on the batch host
        (the environment variable is assumed to point to the execution
        directory), runs the job executable and removes the directory again.

        :param conn: PBS connection handle passed to ``pbs.pbs_submit``
        :param job: mapping providing 'executable', 'arguments',
                    'standardOutput', 'standardError' and 'name'
        :param task: mapping providing 'globalSandbox' (comma separated paths)
        :param requirements: not used by this method
        :return: tuple ``({job name: job id}, None, None)``
        :raises SchedulerError: when pbs_submit returns no job id
        """
        import stat

        ifiles = task['globalSandbox'].split(',')

        # Write a temporary submit script
        f = tempfile.NamedTemporaryFile()
        s = []
        s.append('#!/bin/sh')
        if self.workerNodeWorkDir:
            s.append('cd ' + self.workerNodeWorkDir)
        s.append('if [ ! -d $PBS_JOBCOOKIE ] ; then mkdir -p $PBS_JOBCOOKIE ; fi')
        s.append('cd $PBS_JOBCOOKIE')
        for ifile in ifiles:
            s.append('cp '+ifile+' .')
        s.append(self.jobScriptDir + job['executable']+' '+ job['arguments'] +\
                 ' >' + job['standardOutput'] + ' 2>' + job['standardError'])
        if self.workerNodeWorkDir:
            s.append('cd ' + self.workerNodeWorkDir)

        # this fails if the job is aborted, which leaks disc space. Adding an epilogue to make
        # sure it's gone for good - AMM 18/07/2011
        s.append('rm -fr $PBS_JOBCOOKIE')
        f.write('\n'.join(s))
        f.flush()

        # NOTE(review): the epilogue is a NamedTemporaryFile that is never
        # closed here; confirm it still exists when the batch system needs it.
        epilogue = tempfile.NamedTemporaryFile()
        s = []
        s.append('#!/bin/sh')
        if self.workerNodeWorkDir:
            s.append('cd ' + self.workerNodeWorkDir)
        s.append('rm -fr $PBS_JOBCOOKIE')
        s.append('touch $HOME/done.$1')
        epilogue.write( '\n'.join( s ) )
        epilogue.flush()
        # rwx for the owner only; the former decimal literal 700 produced
        # unrelated permission bits.
        os.chmod( epilogue.name, stat.S_IRWXU )
        attr_dict={'Job_Name':'CRAB_PBS',
                   'Variable_List':self.pbs_env,
                   'Output_Path':self.jobResDir+'wrapper_'+str(job['standardOutput']),
                   'Error_Path':self.jobResDir+'wrapper_'+str(job['standardError'])
                   }

        # plain attributes + resources + one extra slot for the epilogue
        attropl=pbs.new_attropl(len(attr_dict)+len(self.res_dict) + 1)
        i_attr=0
        for k, v in attr_dict.items():
            self.logging.debug("adding k %s" % k)
            attropl[i_attr].name=k
            attropl[i_attr].value=v
            i_attr+=1
        for k, v in self.res_dict.items():
            attropl[i_attr].name='Resource_List'
            attropl[i_attr].resource=k
            attropl[i_attr].value=v
            i_attr+=1

        attropl[i_attr].name = 'Resource_List'
        attropl[i_attr].resource  = 'epilogue'
        attropl[i_attr].value = epilogue.name
        self.logging.debug("adding epilogue: %s" % epilogue.name)

        try:
            jobid = pbs.pbs_submit(conn, attropl, f.name, self.queue, 'NULL')
        finally:
            # the wrapper script is only needed for the submit call itself
            f.close()

        if not jobid:
            err, err_text=pbs.error()
            self.logging.error('Error in job submission')
            self.logging.error('PBS error code '+str(err)+': '+err_text)
            self.pbs_disconn(conn)
            raise SchedulerError('PBS error', str(err)+': '+err_text)

        return {job['name']:jobid}, None, None
コード例 #32
0
    def submit(self):
        """Write the job script to a temporary file and submit it to PBS.

        On success ``self.jobid`` is set and the temporary script is removed;
        when pbs reports an error or returns no job id, an error is logged and
        the script file is left in place.
        """
        import tempfile

        script_text = self.script
        self.log.debug("Going to submit script %s" % script_text)

        # the attribute list starts with the job name
        pbs_attributes = pbs.new_attropl(1)
        pbs_attributes[0].name = 'Job_Name'
        pbs_attributes[0].value = self.name

        # one 'Resource_List' entry per requested resource
        resource_list = pbs.new_attropl(len(self.resources))
        for pos, (res_name, res_value) in enumerate(self.resources.items()):
            resource_list[pos].name = 'Resource_List'
            resource_list[pos].resource = res_name
            resource_list[pos].value = res_value
        pbs_attributes.extend(resource_list)

        # job dependencies, all of the 'afterany' kind
        if self.deps:
            dependencies = pbs.new_attropl(1)
            dependencies[0].name = pbs.ATTR_depend
            dependencies[0].value = ",".join("afterany:%s" % dep for dep in self.deps)
            pbs_attributes.extend(dependencies)
            self.log.debug("Job deps attributes: %s" % dependencies[0].value)

        # environment variables that qsub would normally add;
        # WORKDIR falls back to the current directory
        os.environ.setdefault('WORKDIR', os.getcwd())

        pbsvars = []
        for var in ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']:
            pbsvars.append("PBS_O_%s=%s" % (var, os.environ.get(var, 'NOTFOUND_%s' % var)))
        # followed by the variables specified for this job
        for name, value in self.env_vars.items():
            pbsvars.append("%s=%s" % (name, value))

        variables = pbs.new_attropl(1)
        variables[0].name = 'Variable_List'
        variables[0].value = ",".join(pbsvars)
        pbs_attributes.extend(variables)
        self.log.debug("Job variable attributes: %s" % variables[0].value)

        # mail settings: 'n' disables all mail
        mail = pbs.new_attropl(1)
        mail[0].name = 'Mail_Points'
        mail[0].value = 'n'
        pbs_attributes.extend(mail)
        self.log.debug("Job mail attributes: %s" % mail[0].value)

        # dump the script into a temporary file for pbs_submit
        fd, scriptfn = tempfile.mkstemp()
        script_file = os.fdopen(fd, 'w')
        self.log.debug("Writing temporary job script to %s" % scriptfn)
        script_file.write(script_text)
        script_file.close()

        self.log.debug("Going to submit to queue %s" % self.queue)

        # job submission sometimes fails without producing an error, e.g. when one of the dependency jobs has already finished
        # when that occurs, None will be returned by pbs_submit as job id
        # (the extend parameter is passed as the string 'NULL', as the python api requires)
        jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, 'NULL')
        is_error, errormsg = pbs.error()
        if not is_error and jobid is not None:
            self.log.debug("Succesful job submission returned jobid %s" % jobid)
            self.jobid = jobid
            os.remove(scriptfn)
        else:
            self.log.error("Failed to submit job script %s (job id: %s, error %s)" % (scriptfn, jobid, errormsg))
コード例 #33
0
    def submit(self):
        """Submit the jobscript txt, set self.jobid.

        The script is written to a temporary file which is handed to
        ``pbs.pbs_submit``. On success ``self.jobid`` is set and the temporary
        file is removed; on failure an error is logged and the file is kept.
        """
        txt = self.script
        self.log.debug("Going to submit script %s" % txt)

        # Build default pbs_attributes list
        pbs_attributes = pbs.new_attropl(1)
        pbs_attributes[0].name = 'Job_Name'
        pbs_attributes[0].value = self.name

        # set resource requirements
        resource_attributes = pbs.new_attropl(len(self.resources))
        for idx, (k, v) in enumerate(self.resources.items()):
            resource_attributes[idx].name = 'Resource_List'
            resource_attributes[idx].resource = k
            resource_attributes[idx].value = v
        pbs_attributes.extend(resource_attributes)

        # add job dependencies to attributes
        if self.deps:
            deps_attributes = pbs.new_attropl(1)
            deps_attributes[0].name = pbs.ATTR_depend
            deps_attributes[0].value = ",".join(["afterany:%s" % dep for dep in self.deps])
            pbs_attributes.extend(deps_attributes)
            self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

        # add a bunch of variables (added by qsub)
        # also set PBS_O_WORKDIR to os.getcwd()
        os.environ.setdefault('WORKDIR', os.getcwd())

        defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']
        pbsvars = ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars]
        # extend PBS variables with specified variables
        pbsvars.extend(["%s=%s" % (name, value) for (name, value) in self.env_vars.items()])
        variable_attributes = pbs.new_attropl(1)
        variable_attributes[0].name = 'Variable_List'
        variable_attributes[0].value = ",".join(pbsvars)

        pbs_attributes.extend(variable_attributes)
        self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

        # mail settings
        mail_attributes = pbs.new_attropl(1)
        mail_attributes[0].name = 'Mail_Points'
        mail_attributes[0].value = 'n'  # disable all mail
        pbs_attributes.extend(mail_attributes)
        self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

        import tempfile
        fh, scriptfn = tempfile.mkstemp()
        f = os.fdopen(fh, 'w')
        try:
            self.log.debug("Writing temporary job script to %s" % scriptfn)
            f.write(txt)
        finally:
            # make sure the descriptor is released even if the write fails
            f.close()

        self.log.debug("Going to submit to queue %s" % self.queue)

        # extend parameter should be 'NULL' because this is required by the python api
        extend = 'NULL'
        # A submission can fail without pbs reporting an error (pbs_submit then
        # returns None as job id), so a missing job id is treated as a failure
        # too instead of being stored as self.jobid.
        jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, extend)

        is_error, errormsg = pbs.error()
        if is_error or jobid is None:
            self.log.error("Failed to submit job script %s (job id: %s, error %s)" % (scriptfn, jobid, errormsg))
        else:
            self.log.debug("Succesful job submission returned jobid %s" % jobid)
            self.jobid = jobid
            os.remove(scriptfn)
コード例 #34
0
ファイル: pbsmon.py プロジェクト: pgruenbacher/pbs_python
def main():
    state_list = []
    node_list = []
    node_nr = 0

    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            print "No default pbs server, usage: pbsmon [server] "
            sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print errno, text
        sys.exit(1)

    # We are only interested in the state and jobs of a node
    #
    attrl = pbs.new_attrl(2)
    attrl[0].name = 'state'
    attrl[1].name = 'jobs'

    nodes = pbs.pbs_statnode(con, "", attrl, "NULL")

    # Some is het None dan weer NULL, beats me
    #
    for node in nodes:

        # display_node_status(batch_info)
        node_attr = node.attribs

        # A node can have serveral states, huh. We are only
        # interested in first entry.
        #
        temp = string.splitfields(node_attr[0].value, ',')
        state = temp[0]

        # look if on a free node a job is scheduled, then mark it
        # as other state
        #
        if state == pbs.ND_free:
            if len([x for x in node_attr if x.name == 'jobs']):
                state_list.append(translate_state[pbs_ND_free_and_job])
            else:
                state_list.append(translate_state[state])
        else:
            state_list.append(translate_state[state])

        re_host = re.compile(r"""

      (?P<name>\d+)

      """, re.VERBOSE)

        result = re_host.search(node.name)
        if result:
            node_list.append(result.group('name'))
        else:
            node_nr = node_nr + 1
            node_list.append(str(node_nr))

    display_cluster_status(node_list, state_list)
コード例 #35
0
ファイル: pbs.py プロジェクト: AAFC-MBB/galaxy-1
 def check_watched_items( self ):
     """
     Called by the monitor thread to look at each watched job and deal
     with state changes.

     Each entry of ``self.watched`` ends up in exactly one of these places:

     * back on the watch list (still queued/running, the PBS server could
       not be reached, or the state check failed with an unhandled error);
     * on ``self.work_queue`` paired with ``self.finish_job`` (job left the
       queue or completed);
     * on ``self.work_queue`` paired with ``self.fail_job`` (limit check
       triggered or non-zero exit status);
     * dropped silently (job missing from the status list and already
       marked DELETED).

     ``self.watched`` is replaced with the new watch list at the end.
     """
     new_watched = []
     # reduce pbs load by batching status queries
     ( failures, statuses ) = self.check_all_jobs()
     for pbs_job_state in self.watched:
         job_id = pbs_job_state.job_id
         galaxy_job_id = pbs_job_state.job_wrapper.get_id_tag()
         old_state = pbs_job_state.old_state
         pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params)
         if pbs_server_name in failures:
             log.debug( "(%s/%s) Skipping state check because PBS server connection failed" % ( galaxy_job_id, job_id ) )
             new_watched.append( pbs_job_state )
             continue
         try:
             status = statuses[job_id]
         except KeyError:
             if pbs_job_state.job_wrapper.get_state() == model.Job.states.DELETED:
                 continue
             try:
                 # Recheck to make sure it wasn't a communication problem
                 self.check_single_job( pbs_server_name, job_id )
             except Exception:
                 # Deliberately not a bare "except:", so KeyboardInterrupt and
                 # SystemExit are not swallowed here.
                 errno, text = pbs.error()
                 if errno == 15001:
                     # 15001 == job not in queue
                     log.debug("(%s/%s) PBS job has left queue" % (galaxy_job_id, job_id) )
                     self.work_queue.put( ( self.finish_job, pbs_job_state ) )
                 else:
                     # Unhandled error, continue to monitor
                     log.info("(%s/%s) PBS state check resulted in error (%d): %s" % (galaxy_job_id, job_id, errno, text) )
                     new_watched.append( pbs_job_state )
             else:
                 log.warning( "(%s/%s) PBS job was not in state check list, but was found with individual state check" % ( galaxy_job_id, job_id ) )
                 new_watched.append( pbs_job_state )
             continue
         if status.job_state != old_state:
             log.debug("(%s/%s) PBS job state changed from %s to %s" % ( galaxy_job_id, job_id, old_state, status.job_state ) )
         if status.job_state == "R" and not pbs_job_state.running:
             pbs_job_state.running = True
             pbs_job_state.job_wrapper.change_state( model.Job.states.RUNNING )
         if status.job_state == "R" and status.get( 'resources_used', False ):
             # resources_used may not be in the status for new jobs
             h, m, s = [ int( i ) for i in status.resources_used.walltime.split( ':' ) ]
             runtime = timedelta( 0, s, 0, 0, m, h )
             if pbs_job_state.check_limits( runtime=runtime ):
                 self.work_queue.put( ( self.fail_job, pbs_job_state ) )
                 continue
         elif status.job_state == "C":
             # "keep_completed" is enabled in PBS, so try to check exit status.
             # An explicit comparison is used instead of "assert" because
             # asserts are stripped when Python runs with -O, which would
             # report every failed job as successful.
             try:
                 exit_status = int( status.exit_status )
             except AttributeError:
                 # No exit_status, can't verify proper completion so we just have to assume success.
                 log.debug("(%s/%s) PBS job has completed" % ( galaxy_job_id, job_id ) )
             else:
                 if exit_status != 0:
                     error_message = JOB_EXIT_STATUS.get( exit_status, 'Unknown error: %s' % status.exit_status )
                     pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message
                     log.error( '(%s/%s) PBS job failed: %s' % ( galaxy_job_id, job_id, error_message ) )
                     pbs_job_state.stop_job = False
                     self.work_queue.put( ( self.fail_job, pbs_job_state ) )
                     continue
                 log.debug("(%s/%s) PBS job has completed successfully" % ( galaxy_job_id, job_id ) )
             self.work_queue.put( ( self.finish_job, pbs_job_state ) )
             continue
         pbs_job_state.old_state = status.job_state
         new_watched.append( pbs_job_state )
     # Replace the watch list with the updated version
     self.watched = new_watched
コード例 #36
0
    def submit(self, txt):
        """Submit the jobscript txt, set self.jobid

        The job attributes are built from ``self.options`` (job name) and
        ``self.args``:

        * ``resources``: mapping of resource name to value, one
          ``Resource_List`` entry per item;
        * ``mail``: mapping with key ``send`` (``Mail_Points``) and, when it
          holds more than one item, key ``others`` (``Mail_Users``);
        * ``account``: account string;
        * ``queue``: not an attribute, passed as destination to
          ``pbs.pbs_submit``;
        * anything else is logged as an error and ignored.

        ``txt`` is written to a temporary file which is submitted.  On
        success ``self.jobid`` is set and the temporary file is removed; on
        failure the error is logged and the file is left in place.
        """
        self.log.debug("Going to submit script %s", txt)

        attropl = pbs.new_attropl(2)  # jobparams
        attropl[0].name = 'Job_Name'
        attropl[0].value = self.options.get('name', 'python_pbs_job')
        attropl[1].name = 'Rerunable'
        attropl[1].value = 'y'

        for arg, tmp in self.args.items():
            if arg == 'resources':
                tmpattropl = pbs.new_attropl(len(tmp))
                for idx, (k, v) in enumerate(tmp.items()):
                    tmpattropl[idx].name = 'Resource_List'  # resources
                    tmpattropl[idx].resource = k
                    tmpattropl[idx].value = v
            elif arg == 'mail':
                has_others = len(tmp) > 1
                tmpattropl = pbs.new_attropl(2 if has_others else 1)
                tmpattropl[0].name = 'Mail_Points'
                tmpattropl[0].value = tmp['send']
                if has_others:
                    # Mail_Users gets its own slot; writing it to slot 0
                    # would overwrite the Mail_Points entry set above.
                    tmpattropl[1].name = "Mail_Users"
                    tmpattropl[1].value = tmp['others']
            elif arg == 'queue':
                # The queue goes in the destination field of pbs_submit; no
                # attribute entries must be added for it.
                continue
            elif arg == 'account':
                tmpattropl = pbs.new_attropl(1)
                tmpattropl[0].name = pbs.ATTR_A
                tmpattropl[0].value = tmp
            else:
                self.log.error('Unknown arg %s', arg)
                continue

            attropl.extend(tmpattropl)

        # add a bunch of variables (added by qsub)
        # also set PBS_O_WORKDIR to os.getcwd()
        os.environ.setdefault('WORKDIR', os.getcwd())

        defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']

        tmpattropl = pbs.new_attropl(1)
        tmpattropl[0].name = 'Variable_List'
        tmpattropl[0].value = ",".join(["PBS_O_%s=%s" % (
            x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars])
        attropl.extend(tmpattropl)

        fh, scriptfn = tempfile.mkstemp()
        self.log.debug("Writing temp jobscript to %s" % scriptfn)
        # the context manager closes the descriptor even if the write fails
        with os.fdopen(fh, 'w') as f:
            f.write(txt)

        queue = self.args.get(
            'queue', self.options.get('queue', ''))  # do not set with attropl
        if queue:
            self.log.debug("Going to submit to queue %s", queue)
        else:
            self.log.debug("No queue specified. Will submit to default destination.")

        extend = 'NULL'  # always

        jobid = pbs.pbs_submit(self.pbsconn, attropl, scriptfn, queue, extend)

        is_error, errormsg = pbs.error()
        if is_error:
            self.log.error("Failed to submit job script %s: error %s",
                           scriptfn, errormsg)
        else:
            self.log.debug("Succesful jobsubmission returned jobid %s", jobid)
            self.jobid = jobid
            os.remove(scriptfn)
コード例 #37
0
ファイル: pbsmake.py プロジェクト: cwvh/pbsmake
    def submit_pbs(self, name, taskfile, lastid=None):
        """Submit the target called *name* as a PBS job and return its job id.

        The attributes stored under ``target["attrs"]`` are completed with
        defaults (job name, environment variable list, ``/bin/sh`` as shell)
        and with a dependency list built from the ``torqueid`` of every
        component target plus the optional *lastid*.  The script submitted is
        ``taskfile.name``.

        On success the id is stored in ``target["torqueid"]`` and returned.
        On failure the attributes are printed and an ``Exception`` carrying
        ``pbs.error()`` is raised.
        """
        all_targets = self.targets
        target = all_targets[name]
        env = target["env"]
        attrs = target["attrs"]
        env_vars = env.asdict()

        attrs.setdefault(pbs.ATTR_N, name)

        # Just include all variables by default
        env_pairs = ["%s=%s" % (key, val) for key, val in env_vars.iteritems()]
        attrs.setdefault(pbs.ATTR_v, ",".join(env_pairs))

        # Track job dependencies; the dependency type may be given after
        # "::" in the target name and defaults to "afterok".
        dep_type = name.partition("::")[-1] or "afterok"
        dependencies = [
            "%s:%s" % (dep_type, all_targets[dep]["torqueid"])
            for dep in target["components"]
        ]
        if lastid:
            dependencies.append("%s:%s" % (dep_type, lastid))
        if dependencies:
            attrs[pbs.ATTR_depend] = ",".join(dependencies)

        # /bin/sh as a default shell will generally do the right thing: it
        # honors a #! line at the top of the file and interprets basic
        # commands without one.  Users who want another shell (bash, csh,
        # ksh, python, ...) can select it via #!, and users with non-standard
        # login shells can still run pbsmake files written by others.
        attrs.setdefault(pbs.ATTR_S, "/bin/sh")

        # ATTR_l is special: each resource needs its own attropl entry named
        # pbs.ATTR_l, so take it out of the generic attributes.
        resources = []
        if pbs.ATTR_l in attrs:
            resources = attrs.pop(pbs.ATTR_l).split(",")

        # Attach attributes to the job in the form the pbs module expects
        generic_count = len(attrs)
        attropl = pbs.new_attropl(generic_count + len(resources))
        for pos, attr_name in enumerate(attrs):
            attropl[pos].name = attr_name
            attropl[pos].value = env.interp(attrs[attr_name], defer=False)
        for pos, spec in enumerate(resources, generic_count):
            attropl[pos].name = pbs.ATTR_l
            res, val = spec.split("=", 1)
            attropl[pos].resource = res
            attropl[pos].value = env.interp(val, defer=False)

        destination = attrs["queue"] if "queue" in attrs else ""

        # attempt to submit job
        lastid = pbs.pbs_submit(self.conn, attropl, taskfile.name, destination, "")
        if not lastid:
            print "Error submitting job: %s\n\tAttributes:" % name
            for attr, val in attrs.items():
                print "\t\t%s: %s" % (attr, val)
            raise Exception(pbs.error())

        target["torqueid"] = lastid
        return lastid
コード例 #38
0
 def check_watched_items(self):
     """
     Called by the monitor thread to look at each watched job and deal
     with state changes.

     Each entry of ``self.watched`` ends up in exactly one of these places:

     * back on the watch list (still queued/running, the PBS server could
       not be reached, or the state check failed with an unhandled error);
     * on ``self.work_queue`` paired with ``self.finish_job`` (job left the
       queue or completed);
     * on ``self.work_queue`` paired with ``self.fail_job`` (limit check
       triggered or non-zero exit status);
     * dropped silently (job missing from the status list and already
       marked DELETED).

     ``self.watched`` is replaced with the new watch list at the end.
     """
     new_watched = []
     # reduce pbs load by batching status queries
     (failures, statuses) = self.check_all_jobs()
     for pbs_job_state in self.watched:
         job_id = pbs_job_state.job_id
         galaxy_job_id = pbs_job_state.job_wrapper.get_id_tag()
         old_state = pbs_job_state.old_state
         pbs_server_name = self.__get_pbs_server(
             pbs_job_state.job_destination.params)
         if pbs_server_name in failures:
             log.debug(
                 "(%s/%s) Skipping state check because PBS server connection failed"
                 % (galaxy_job_id, job_id))
             new_watched.append(pbs_job_state)
             continue
         try:
             status = statuses[job_id]
         except KeyError:
             if pbs_job_state.job_wrapper.get_state(
             ) == model.Job.states.DELETED:
                 continue
             try:
                 # Recheck to make sure it wasn't a communication problem
                 self.check_single_job(pbs_server_name, job_id)
             except Exception:
                 # Deliberately not a bare "except:", so KeyboardInterrupt and
                 # SystemExit are not swallowed here.
                 errno, text = pbs.error()
                 if errno == 15001:
                     # 15001 == job not in queue
                     log.debug("(%s/%s) PBS job has left queue" %
                               (galaxy_job_id, job_id))
                     self.work_queue.put((self.finish_job, pbs_job_state))
                 else:
                     # Unhandled error, continue to monitor
                     log.info(
                         "(%s/%s) PBS state check resulted in error (%d): %s"
                         % (galaxy_job_id, job_id, errno, text))
                     new_watched.append(pbs_job_state)
             else:
                 log.warning(
                     "(%s/%s) PBS job was not in state check list, but was found with individual state check"
                     % (galaxy_job_id, job_id))
                 new_watched.append(pbs_job_state)
             continue
         if status.job_state != old_state:
             log.debug("(%s/%s) PBS job state changed from %s to %s" %
                       (galaxy_job_id, job_id, old_state, status.job_state))
         if status.job_state == "R" and not pbs_job_state.running:
             pbs_job_state.running = True
             pbs_job_state.job_wrapper.change_state(
                 model.Job.states.RUNNING)
         if status.job_state == "R" and status.get('resources_used', False):
             # resources_used may not be in the status for new jobs
             h, m, s = [
                 int(i) for i in status.resources_used.walltime.split(':')
             ]
             runtime = timedelta(0, s, 0, 0, m, h)
             if pbs_job_state.check_limits(runtime=runtime):
                 self.work_queue.put((self.fail_job, pbs_job_state))
                 continue
         elif status.job_state == "C":
             # "keep_completed" is enabled in PBS, so try to check exit status.
             # An explicit comparison is used instead of "assert" because
             # asserts are stripped when Python runs with -O, which would
             # report every failed job as successful.
             try:
                 exit_status = int(status.exit_status)
             except AttributeError:
                 # No exit_status, can't verify proper completion so we just have to assume success.
                 log.debug("(%s/%s) PBS job has completed" %
                           (galaxy_job_id, job_id))
             else:
                 if exit_status != 0:
                     error_message = JOB_EXIT_STATUS.get(
                         exit_status,
                         'Unknown error: %s' % status.exit_status)
                     pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message
                     log.error('(%s/%s) PBS job failed: %s' %
                               (galaxy_job_id, job_id, error_message))
                     pbs_job_state.stop_job = False
                     self.work_queue.put((self.fail_job, pbs_job_state))
                     continue
                 log.debug("(%s/%s) PBS job has completed successfully" %
                           (galaxy_job_id, job_id))
             self.work_queue.put((self.finish_job, pbs_job_state))
             continue
         pbs_job_state.old_state = status.job_state
         new_watched.append(pbs_job_state)
     # Replace the watch list with the updated version
     self.watched = new_watched
コード例 #39
0
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18", single=False, background="", tools=None, job_server="", ncpus=8, logger=None, max_time=None, fg_file=None, bg_file=None):
    """Run the selected motif prediction tools as PBS jobs and collect results.

    For every enabled tool a shell script is written with
    ``write_shell_script`` and submitted to the ``batchq`` queue; tools with
    ``use_width`` set get one job per motif width.  The function then polls
    the batch system every 5 seconds.  Once a job is no longer known to it,
    the job's ``.pwm``, ``.stdout`` and ``.stderr`` files are read from the
    ``torque`` directory next to *fastafile* and added to a
    ``PredictionResult``.

    Parameters:
        fastafile: input FASTA file; also determines the run directory.
        outfile, fg_file, bg_file: passed on to ``PredictionResult``.
        analysis: one of "xs", "small", "medium", "large", "xl"; selects the
            range of motif widths.  "xs" only limits the maximum width and
            is passed to the tools as "small".
        organism, single, background: passed to the tools via ``params``.
        tools: mapping of tool name to a true/false flag; when empty the
            default tools from ``MotifConfig`` are used.
        job_server, ncpus: not used by this function.
        logger: logger used for all messages.  It is called unconditionally,
            so the default ``None`` fails at the first message.
        max_time: maximum number of seconds to wait for the jobs; a false
            value means wait until all jobs are finished.
    """
    if tools is None:
        tools = {}

    config = MotifConfig()

    if not tools:
        # NOTE(review): get_default_params is subscripted without being
        # called; if it is a method rather than a property this raises a
        # TypeError -- confirm against MotifConfig.
        tools = dict([(x,1) for x in config.get_default_params["tools"].split(",")])

    #logger = logging.getLogger('prediction.pp_predict_motifs')

    wmin = 5
    step = 1
    if analysis in ["large","xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs":5,"small":8, "medium":10,"large":14, "xl":20}
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    result = PredictionResult(outfile, logger=logger, fg_file=fg_file, bg_file=bg_file)

    # Dynamically load all tools
    toolio = [x[1]() for x in inspect.getmembers(
                                                tool_classes,
                                                lambda x:
                                                        inspect.isclass(x) and
                                                        issubclass(x, tool_classes.MotifProgram)
                                                ) if x[0] != 'MotifProgram']

    # TODO:
    # Add warnings for running time: Weeder GADEM

    # Prepare PBS submission
    server = pbs.pbs_default()
    c = pbs.pbs_connect(server)
    q = PBSQuery()
    attropl = pbs.new_attropl(6)
    # Name
    attropl[0].name  = pbs.ATTR_N
    # Restartable
    attropl[1].name  = pbs.ATTR_r
    attropl[1].value = 'y'
    # Walltime
    attropl[2].name  = pbs.ATTR_l
    attropl[2].resource = 'walltime'
    attropl[2].value = '600'
    # Node requirements
    attropl[3].name  = pbs.ATTR_l
    attropl[3].resource = 'nodes'
    attropl[3].value = '1:ppn=1'
    # stdout / stderr paths, filled in per job
    attropl[4].name  = pbs.ATTR_o
    attropl[5].name  = pbs.ATTR_e

    rundir = os.path.join(os.path.split(os.path.abspath(fastafile))[0], "torque")
    if not os.path.exists(rundir):
        os.mkdir(rundir)

    params = {
              'analysis': analysis,
              'background':background,
              "single":single,
              "organism":organism
              }

    # job id -> job name of every successfully submitted job
    jobs = {}

    def _submit(tool_name, failure_fmt):
        """Write the script for tool_name, submit it and record the job."""
        sh = write_shell_script(tool_name, fastafile, rundir=rundir, params=params)
        job_name = os.path.basename(os.path.splitext(sh)[0])
        attropl[0].value = job_name
        attropl[4].value = "{0}/{1}.stdout".format(rundir, job_name)
        attropl[5].value = "{0}/{1}.stderr".format(rundir, job_name)
        job_id = pbs.pbs_submit(c, attropl, sh, "batchq", 'NULL')
        e, e_txt = pbs.error()
        if e:
            logger.error(failure_fmt.format(e_txt))
        else:
            jobs[job_id] = job_name

    for t in toolio:
        if not tools.get(t.name):
            logger.debug("Skipping %s" % t.name)
            continue

        if t.use_width:
            for i in range(wmin, wmax + 1, step):
                logger.info("Starting %s job, width %s" % (t.name, i))
                params['width'] = i
                _submit(t.name, "Failed: {0}")
        else:
            logger.debug("Starting %s job" % t.name)
            _submit(t.name, "Failed submission: {0}")

    ### Wait until all jobs are finished or the time runs out ###
    start_time = time()
    try:
        # The parentheses matter: without them "and" binds tighter than
        # "or" and the loop keeps spinning until max_time has passed even
        # when every job is already finished.
        while jobs and (not max_time or time() - start_time < max_time):
            # iterate over a copy, finished jobs are removed from the dict
            for job_id, job_name in list(jobs.items()):
                if q.getjob(job_id):
                    # still known to the batch system
                    continue

                motifs = []
                pwmfile = os.path.join(rundir, "{0}.pwm".format(job_name))
                if os.path.exists(pwmfile):
                    motifs = read_motifs(open(pwmfile), fmt="pwm")
                else:
                    logger.error("Job {0} finished, but couldn find {1}!".format(job_name, pwmfile))

                with open(os.path.join(rundir, "{0}.stdout".format(job_name))) as fh:
                    stdout = fh.read()
                with open(os.path.join(rundir, "{0}.stderr".format(job_name))) as fh:
                    stderr = fh.read()

                result.add_motifs(job_id, (motifs, stdout, stderr))
                #for fname in glob.glob("{0}*".format(job_name)):
                #    logger.debug("Deleting {0}".format(fname))
                #    #os.unlink(fname)

                del jobs[job_id]
            sleep(5)

    ### Or the user gets impatient... ###
    except KeyboardInterrupt:
        # Destroy all running jobs
        logger.info("Caught interrupt, destroying all running jobs")
コード例 #40
0
def pbsmon():
    global NODES_PER_RACK, N_RACKS, PBS_STATES

    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()

    if not pbs_server:
        print "No default pbs server, usage: %s [server]" % os.path.basename(sys.argv[0])
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print errno, text
        sys.exit(1)

    # get the state of the nodes
    attrl = pbs.new_attrl(2)
    attrl[0].name = "state"
    attrl[1].name = "jobs"
    nodes = pbs.pbs_statnode(con, "", attrl, "NULL")

    node_dict = {}

    count_states = {}
    for key in PBS_STATES.keys():
        count_states[key] = 0

    for node in nodes:
        node_attr = node.attribs
        temp = string.split(node_attr[0].value, ",")
        state = temp[0]
        state_char = PBS_STATES[state]
        count_states[state] = count_states[state] + 1

        if state == pbs.ND_free:
            if len(node_attr) > 1:
                # 				print 'TD: %s' % node.name, node_attr[1]
                state_char = PBS_STATES[pbs_ND_single]
                count_states[pbs.ND_free] = count_states[pbs.ND_free] - 1
                count_states[pbs_ND_single] = count_states[pbs_ND_single] + 1

        # 		print 'TD: %s %s' % (node.name, state_char)
        node_dict[node.name] = state_char

    legend = PBS_STATES.keys()
    legend.sort()

    # print nodes with gb-r%dn%d naming scheme
    print "  ",
    for rack in xrange(1, N_RACKS + 1):
        print "%2d" % rack,
    print

    for node_nr in xrange(1, NODES_PER_RACK + 1):
        print "%2d" % node_nr,

        for rack in xrange(1, N_RACKS + 1):
            node_name = "gb-r%dn%d" % (rack, node_nr)

            if node_dict.has_key(node_name):
                print " %s" % node_dict[node_name],

                del node_dict[node_name]
            else:
                print "  ",

        if node_nr - 1 < len(legend):
            state = legend[node_nr - 1]
            print "  %s  %-13s : %d" % (PBS_STATES[state], state, count_states[state])
        else:
            print

    print

    # any other nodes?
    arr = node_dict.keys()
    if arr:
        arr.sort()

        for node in arr:
            print "%s %s" % (node, node_dict[node])

        print
コード例 #41
0
    def submit(self):
        """Submit ``self.job_script`` to the default PBS server.

        The environment from ``self.generate_env()`` is always sent as the
        variable list.  Each of the following attributes is added only when
        the corresponding instance attribute is set to a true value: name,
        walltime, nodes, stdout path, stderr path, dependency list, mail
        options, mem and vmem.

        Returns:
            The job id returned by ``pbs.pbs_submit`` (also stored in
            ``self.job_id``).

        Raises:
            Exception: when ``pbs.error()`` reports an error after the
                submission; the message is ``"<errno>: <text>"``.
        """
        # (attribute name, resource name or None, value) in submission order
        optional = [
            (pbs.ATTR_N, None, self.name),
            (pbs.ATTR_l, 'walltime', self.walltime),
            (pbs.ATTR_l, 'nodes', self.nodes),
            (pbs.ATTR_o, None, self.stdout_path),
            # stderr uses ATTR_e; with ATTR_o it would replace the stdout
            # path and never set the error path
            (pbs.ATTR_e, None, self.stderr_path),
            (pbs.ATTR_depend, None, self.dependency_list),
            (pbs.ATTR_m, None, self.mail_options),
            (pbs.ATTR_l, 'mem', self.mem),
            (pbs.ATTR_l, 'vmem', self.vmem),
        ]

        # one extra slot for the variable list, which is always present
        attropl = pbs.new_attropl(self.attribute_count + 1)
        attropl[0].name = pbs.ATTR_v
        attropl[0].value = self.generate_env()
        attropl_idx = 1

        for attr_name, resource, value in optional:
            if not value:
                continue
            attropl[attropl_idx].name = attr_name
            if resource is not None:
                attropl[attropl_idx].resource = resource
            attropl[attropl_idx].value = value
            attropl_idx += 1

        connection = pbs.pbs_connect(pbs.pbs_default())
        try:
            self.job_id = pbs.pbs_submit(connection, attropl, self.job_script, None, None)
            # Read the error state right after the submission so that a later
            # API call (the disconnect) cannot mask it.
            e, e_msg = pbs.error()
        finally:
            pbs.pbs_disconnect(connection)

        # the batch system returned an error, throw exception
        if e:
            message = "%d: %s" % (e, e_msg)
            raise Exception(message)

        return self.job_id
コード例 #42
0
                    attropl[index].name = getattr(pbs, 'ATTR_' + attr)
                    attropl[index].resource = resource
                    attropl[index].value = value
                    index += 1
            else:
                attropl[index].name = getattr(pbs, 'ATTR_' + attr)
                attropl[index].value = value
                index += 1
        return attropl


if __name__ == '__main__':
    # Script entry point: parse the job script given as first argument and
    # submit it to the 'batch' queue of the default PBS server.  Prints the
    # job id on success, or the PBS error number and text on failure.
    jp = JobParser()

    # Only the argument lookup is guarded, so an IndexError raised while
    # parsing the script is not misreported as a usage error.
    try:
        jobscript = sys.argv[1]
    except IndexError:
        print("Usage: %s <jobscript>" % (sys.argv[0]))
        sys.exit(1)

    jp.read(jobscript)

    server_name = pbs.pbs_default()
    con = pbs.pbs_connect(server_name)
    if con < 0:
        # a negative handle means the connection failed; submitting is pointless
        e, e_txt = pbs.error()
        print(e, e_txt)
        sys.exit(1)

    job_id = pbs.pbs_submit(con, jp.get_attropl(), jobscript, 'batch',
                            'NULL')

    e, e_txt = pbs.error()
    if e:
        print(e, e_txt)
    else:
        print(job_id)
コード例 #43
0
def pbsmon():
    global NODES_PER_RACK, N_RACKS, PBS_STATES

    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()

    if not pbs_server:
        print 'No default pbs server, usage: %s [server]' % os.path.basename(
            sys.argv[0])
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print errno, text
        sys.exit(1)

# get the state of the nodes
    attrl = pbs.new_attrl(2)
    attrl[0].name = 'state'
    attrl[1].name = 'jobs'
    nodes = pbs.pbs_statnode(con, '', attrl, 'NULL')

    node_dict = {}

    count_states = {}
    for key in PBS_STATES.keys():
        count_states[key] = 0

    for node in nodes:
        node_attr = node.attribs
        temp = string.split(node_attr[0].value, ',')
        state = temp[0]
        state_char = PBS_STATES[state]
        count_states[state] = count_states[state] + 1

        if state == pbs.ND_free:
            if len(node_attr) > 1:
                #				print 'TD: %s' % node.name, node_attr[1]
                state_char = PBS_STATES[pbs_ND_single]
                count_states[pbs.ND_free] = count_states[pbs.ND_free] - 1
                count_states[pbs_ND_single] = count_states[pbs_ND_single] + 1

#		print 'TD: %s %s' % (node.name, state_char)
        node_dict[node.name] = state_char

    legend = PBS_STATES.keys()
    legend.sort()

    # print nodes with gb-r%dn%d naming scheme
    print '  ',
    for rack in xrange(1, N_RACKS + 1):
        print '%2d' % rack,
    print

    for node_nr in xrange(1, NODES_PER_RACK + 1):
        print '%2d' % node_nr,

        for rack in xrange(1, N_RACKS + 1):
            node_name = 'gb-r%dn%d' % (rack, node_nr)

            if node_dict.has_key(node_name):
                print ' %s' % node_dict[node_name],

                del node_dict[node_name]
            else:
                print '  ',

        if node_nr - 1 < len(legend):
            state = legend[node_nr - 1]
            print '  %s  %-13s : %d' % (PBS_STATES[state], state,
                                        count_states[state])
        else:
            print

    print

    # any other nodes?
    arr = node_dict.keys()
    if arr:
        arr.sort()

        for node in arr:
            print '%s %s' % (node, node_dict[node])

        print
コード例 #44
0
        def pbs_batch( self, nodes, attrs=None, note_attributes=None ):
                nodeserror = list()
                if not attrs and not note_attributes:
                        raise sara_nodesException, 'attrs and note_attributes can not be empty together!'

                if not self.dryrun:
                        if note_attributes and len( note_attributes ) == 3:
                                if attrs:
                                        attributes = attrs + pbs.new_attropl(1)
                                        attributes[1].name = pbs.ATTR_NODE_note
                                        attributes[1].op = pbs.SET
                                else:
                                        attributes = pbs.new_attropl(1)
                                        attributes[0].name = pbs.ATTR_NODE_note
                                        attributes[0].op = pbs.SET
                        else:
                                attributes = attrs
                        # Some hacking here because some limitation in the Torque 2.4 version
                        # fetching note data first for all nodes!
                        tmp_node_note = dict()

                        for node in nodes:
                                if note_attributes and len( note_attributes ) == 3:
                                        tmp_node_note[ node ] = self.note( node, note_attributes )

                        pbs_server = pbs.pbs_default()

                        if not pbs_server:
                                raise sara_nodesException, 'Default pbs server not found!'

                        pbs_connection = pbs.pbs_connect( pbs_server )
                        for node in nodes:
                                if note_attributes and len( note_attributes ) == 3:
                                        try:
                                                if attrs:
                                                        attributes[1].value = tmp_node_note[ node ]
                                                else:
                                                        attributes[0].value = tmp_node_note[ node ]
                                        except KeyError:
                                                pass
                                rcode = pbs.pbs_manager( pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node, attributes, 'NULL' )
                                if rcode > 0:
                                        errno, text = pbs.error()
                                        nodeserror.append( '%s: %s (%s)' % ( node, text, errno ) )
                else:
                        p = PBSQuery.PBSQuery()
                        pbsnodes = p.getnodes().keys()

                        print '%*s:' % ( 7, 'Nodes' ),
                        firstitem = True

                        for node in nodes:

                                if node in pbsnodes:
                                        if firstitem:
                                                print '%s' % node
                                                firstitem = False
                                        else:
                                                print '%*s' % ( 17, node )
                                else:
                                        nodeserror.append( '%s: does not exist' % node )

                if len( nodeserror ) > 0:
                        raise sara_nodesException, nodeserror
コード例 #45
0
ファイル: SchedulerPbs.py プロジェクト: dmwm/ProdCommon
    def submitJob(self, conn, job, task=None, requirements=''):
        """Submit one job to PBS through a generated wrapper script.

        Two temporary shell scripts are written:

        * the wrapper, which creates and enters $PBS_JOBCOOKIE, copies every
          file listed in task['globalSandbox'] into it, runs the job
          executable with stdout/stderr redirected to the job's
          standardOutput/standardError, and finally removes $PBS_JOBCOOKIE;
        * an epilogue, which removes $PBS_JOBCOOKIE again and touches
          $HOME/done.$1.  It is attached as the 'epilogue' resource.

        conn         -- PBS connection handle passed to pbs.pbs_submit
        job          -- mapping providing 'executable', 'arguments',
                        'standardOutput', 'standardError' and 'name'
        task         -- mapping providing 'globalSandbox', a comma-separated
                        file list.  Required in practice despite the None
                        default: it is subscripted unconditionally.
        requirements -- not used by this method

        Returns the tuple ({job['name']: jobid}, None, None).

        Raises SchedulerError when pbs_submit returns no job id; the
        connection is closed through self.pbs_disconn(conn) first.
        """
        import stat

        # Need to copy the inputsandbox to WN before submitting a job
        ifiles = task['globalSandbox'].split(',')

        # Write a temporary submit script
        # NB: we assume an env var PBS_JOBCOOKIE points to the exec dir on the batch host
        f = tempfile.NamedTemporaryFile()
        s = ['#!/bin/sh']
        if self.workerNodeWorkDir:
            s.append('cd ' + self.workerNodeWorkDir)
        s.append(
            'if [ ! -d $PBS_JOBCOOKIE ] ; then mkdir -p $PBS_JOBCOOKIE ; fi')
        s.append('cd $PBS_JOBCOOKIE')
        for ifile in ifiles:
            s.append('cp ' + ifile + ' .')
        s.append(self.jobScriptDir + job['executable'] + ' ' + job['arguments'] +
                 ' >' + job['standardOutput'] + ' 2>' + job['standardError'])
        if self.workerNodeWorkDir:
            s.append('cd ' + self.workerNodeWorkDir)

        # this fails if the job is aborted, which leaks disc space. Adding an epilogue to make
        # sure it's gone for good - AMM 18/07/2011
        s.append('rm -fr $PBS_JOBCOOKIE')
        f.write('\n'.join(s))
        f.flush()

        # NOTE(review): the epilogue is a NamedTemporaryFile that is never
        # closed explicitly, so its lifetime follows this local variable --
        # confirm the file still exists when the batch system needs it.
        epilogue = tempfile.NamedTemporaryFile()
        s = ['#!/bin/sh']
        if self.workerNodeWorkDir:
            s.append('cd ' + self.workerNodeWorkDir)
        s.append('rm -fr $PBS_JOBCOOKIE')
        s.append('touch $HOME/done.$1')
        epilogue.write('\n'.join(s))
        epilogue.flush()
        # Owner read/write/execute (octal 0700).  The decimal literal 700 is
        # octal 1274, which leaves the owner without read or execute rights.
        os.chmod(epilogue.name, stat.S_IRWXU)

        attr_dict = {
            'Job_Name': 'CRAB_PBS',
            'Variable_List': self.pbs_env,
            'Output_Path':
            self.jobResDir + 'wrapper_' + str(job['standardOutput']),
            'Error_Path':
            self.jobResDir + 'wrapper_' + str(job['standardError'])
        }

        # One slot per plain attribute, one per resource, plus the epilogue.
        attropl = pbs.new_attropl(len(attr_dict) + len(self.res_dict) + 1)
        i_attr = 0
        for k in attr_dict.keys():
            self.logging.debug("adding k %s" % k)
            attropl[i_attr].name = k
            attropl[i_attr].value = attr_dict[k]
            i_attr += 1
        for k in self.res_dict.keys():
            attropl[i_attr].name = 'Resource_List'
            attropl[i_attr].resource = k
            attropl[i_attr].value = self.res_dict[k]
            i_attr += 1

        attropl[i_attr].name = 'Resource_List'
        attropl[i_attr].resource = 'epilogue'
        attropl[i_attr].value = epilogue.name
        self.logging.debug("adding epilogue: %s" % epilogue.name)

        try:
            jobid = pbs.pbs_submit(conn, attropl, f.name, self.queue, 'NULL')
        finally:
            # Closing removes the temporary wrapper script, also when the
            # submit call itself raises.
            f.close()

        if not jobid:
            err, err_text = pbs.error()
            self.logging.error('Error in job submission')
            self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
            self.pbs_disconn(conn)
            raise SchedulerError('PBS error', str(err) + ': ' + err_text)

        return {job['name']: jobid}, None, None