def set_hold(self, hold_type=None):
    """Place a hold of the requested type on this job.

    :param hold_type: PBS hold type to set; pbs.USER_HOLD is used when None.

    If the hold type is already present in self.holds only a warning is
    logged. Otherwise the hold is requested via pbs.pbs_holdjob and, when
    neither pbs.error() nor the returned exit code signals a problem, the
    hold type is appended to self.holds.
    """
    # the default is resolved here instead of in the signature, so that this
    # module can still be loaded when the pbs module is not available
    if hold_type is None:
        hold_type = pbs.USER_HOLD

    # a hold that is already registered is not requested a second time
    if hold_type in self.holds:
        self.log.warning("Hold type %s was already set for %s" % (hold_type, self.jobid))
        return

    if hold_type not in KNOWN_HOLD_TYPES:
        self.log.error("set_hold: unknown hold type: %s (supported: %s)" % (hold_type, KNOWN_HOLD_TYPES))

    # request the hold and inspect both the exit code and the PBS error state
    exit_code = pbs.pbs_holdjob(self.pbsconn, self.jobid, hold_type, NULL)
    pbs_errno, pbs_errmsg = pbs.error()
    if not (pbs_errno or exit_code):
        # success: remember that this hold is now in place
        self.holds.append(hold_type)
        return

    details = (hold_type, self.jobid, pbs_errno, exit_code, pbs_errmsg)
    self.log.error("Failed to set hold of type %s on job %s (is_error: %s, exit code: %s, msg: %s)" % details)
def main():
    """Set the 'note' attribute of one node on the default PBS server.

    The node hostname is taken from the first command-line argument. The
    script exits with status 1 when no default server is configured, when the
    hostname argument is missing, or when the connection to the server cannot
    be established. If pbs_manager reports a failure, its return code and the
    PBS error number/text are printed.
    """
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print('No default server')
        sys.exit(1)

    if len(sys.argv) < 2:
        print("Usage: set_property.py <hostname>")
        sys.exit(1)
    hostname = sys.argv[1]

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        # a negative handle means the connection failed; report why and stop
        # instead of issuing requests on an invalid handle
        errno, text = pbs.error()
        print("%s %s" % (errno, text))
        sys.exit(1)

    try:
        attrop_l = pbs.new_attropl(1)
        attrop_l[0].name = 'note'
        attrop_l[0].value = 'set_something_useful'
        attrop_l[0].op = pbs.SET

        r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, hostname, attrop_l, 'NULL')
        if r > 0:
            print("%s ;" % r)
            errno, text = pbs.error()
            print("%s %s" % (errno, text))
    finally:
        # always give the connection back to the server
        pbs.pbs_disconnect(con)
def release_hold(self, hold_type=None):
    """Release hold on job of specified type.

    :param hold_type: PBS hold type to release; pbs.USER_HOLD is used when None.
    :raises EasyBuildError: if the hold type is registered but not one of
        KNOWN_HOLD_TYPES, or if pbs.pbs_rlsjob / pbs.error() report a failure.

    When the hold type is not present in self.holds, a warning is logged and
    no release is attempted.
    """
    # we can't set this default for hold_type in function signature,
    # because we need to be able to load this module even when the pbs module is not available
    if hold_type is None:
        hold_type = pbs.USER_HOLD

    # only release hold if it was set
    if hold_type in self.holds:
        if hold_type not in KNOWN_HOLD_TYPES:
            raise EasyBuildError("release_hold: unknown hold type: %s (supported: %s)",
                                 hold_type, KNOWN_HOLD_TYPES)

        # release hold, check for errors, remove from list of holds
        ec = pbs.pbs_rlsjob(self.pbsconn, self.jobid, hold_type, NULL)
        is_error, errormsg = pbs.error()
        if is_error or ec:
            raise EasyBuildError("Failed to release hold type %s on job %s (is_error: %s, exit code: %s, msg: %s)",
                                 hold_type, self.jobid, is_error, ec, errormsg)

        self.holds.remove(hold_type)
        # only report success once the release is known to have worked
        self.log.debug("Released hold of type %s for job %s" % (hold_type, self.jobid))
    else:
        self.log.warning("No hold type %s was set for %s, so skipping hold release" % (hold_type, self.jobid))
def _connect_to_server(server=None):
    """
    Open a connection to a pbs_server and return the connection handle.

    :param server: hostname of the pbs_server; when it is None (or otherwise
        falsy) the default server reported by pbs.pbs_default() is used.
    :return: the connection handle returned by pbs.pbs_connect
    :raises Exception: if no connection could be made after the initial
        attempt plus up to _MAX_RETRY retries

    This function is shared between JobManager and TorqueJobRunner
    """
    target = server or pbs.pbs_default()

    handle = pbs.pbs_connect(target)
    attempts = 0
    while handle <= 0 and attempts < _MAX_RETRY:
        attempts += 1
        # wait a little longer before every new attempt (1s, 4s, 9s, ...)
        time.sleep(attempts ** 2)
        handle = pbs.pbs_connect(target)

    if handle > 0:
        return handle

    # the batch system returned an error, throw exception
    errno, _unused_text = pbs.error()
    message = "Error connecting to pbs_server. Torque error {0}: '{1}'".format(
        errno, torque_strerror(errno))
    raise Exception(message)
def submit_with_retry(pbs_attrs, script_path, queue, pbs_server=None):
    """
    Submit a job script, retrying when pbs.pbs_submit returns no job ID.

    :param pbs_attrs: job attributes passed through to pbs.pbs_submit
    :param script_path: path of the job script to submit
    :param queue: destination passed through to pbs.pbs_submit
    :param pbs_server: server handed to _connect_to_server
    :return: the job ID returned by pbs.pbs_submit
    :raises Exception: if no job ID was obtained after the initial attempt
        plus up to _MAX_RETRY retries
    """
    conn = _connect_to_server(pbs_server)

    def _attempt():
        # one submission attempt over the open connection
        return pbs.pbs_submit(conn, pbs_attrs, script_path, queue, None)

    job_id = _attempt()
    attempts = 0
    while not job_id and attempts < _MAX_RETRY:
        attempts += 1
        print("Retrying connection...", file=sys.stderr)
        # back off a little longer before every new attempt
        time.sleep(attempts ** 2)
        job_id = _attempt()

    pbs.pbs_disconnect(conn)

    if job_id:
        return job_id

    # the batch system returned an error, throw exception
    errno, _unused_text = pbs.error()
    raise Exception("Error submitting job. Torque error {0}: '{1}'".format(
        errno, torque_strerror(errno)))
def _process(self, batch_list):
    '''Apply the changes in batch_list to the nodes of the default batch server.

    batch_list is iterated as a sequence of items where item[0] is passed to
    pbs.pbs_manager as the node name and item[1] as the attribute list.

    In dry-run mode (ARGS_DRYRUN) no connection is made; the pbs_manager call
    that would have been issued is printed for every node instead. Otherwise
    a single connection is opened, used for all nodes and always closed
    again. A failure for one node is reported on stderr and does not stop the
    processing of the remaining nodes. Exits with status 1 when no default
    server can be found.
    '''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process input:%s' % str(batch_list), file=sys.stderr)

    ## Always get the pbs_server name, even in dry-run mode
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        _print('Could not locate a pbs server', file=sys.stderr)
        sys.exit(1)

    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server, file=sys.stderr)

    ## Dry-run: only show what would be executed, never talk to the server
    if ARGS_DRYRUN:
        for node in batch_list:
            _print("pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')"
                   % (node[0], str(node[1])))
        return

    ## One connection is shared by all nodes; reconnecting for every node
    ## would leave all but the last connection open
    pbs_connection = pbs.pbs_connect(pbs_server)
    try:
        ## Execute the changes
        for node in batch_list:
            rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                                    node[0], node[1], 'NULL')
            if rcode > 0:
                errno, text = pbs.error()
                _print('PBS error for node \'%s\': %s (%s)' % (node[0], text, errno), file=sys.stderr)
    finally:
        ## Close the connection with the batch system
        pbs.pbs_disconnect(pbs_connection)
def pbs_conn(self):
    """Connect to the default PBS server and return the connection handle.

    Raises SchedulerError (after logging the PBS error code and text) when
    pbs.pbs_connect returns a negative handle.
    """
    default_server = pbs.pbs_default()
    handle = pbs.pbs_connect(default_server)
    if handle >= 0:
        return handle

    code, message = pbs.error()
    self.logging.error('Error in PBS server conncet')
    # "<code>: <text>" is used both in the log line and in the exception
    detail = str(code) + ': ' + message
    self.logging.error('PBS error code ' + detail)
    raise SchedulerError('PBS error', detail)
def pbs_conn(self):
    """Open a connection to the default PBS server.

    Returns the handle given by pbs.pbs_connect. A negative handle is treated
    as a failure: the PBS error is logged and SchedulerError is raised.
    """
    connection = pbs.pbs_connect(pbs.pbs_default())
    failed = connection < 0
    if failed:
        pbs_errno, pbs_text = pbs.error()
        self.logging.error('Error in PBS server conncet')
        # same "<code>: <text>" fragment for the log line and the exception
        summary = ''.join([str(pbs_errno), ': ', pbs_text])
        self.logging.error('PBS error code ' + summary)
        raise SchedulerError('PBS error', summary)
    return connection
def run(self):
    """Write self.script to a file, submit it to the 'batch' queue and clean up.

    The job ID returned by pbs.pbs_submit is stored in self.jobid. When
    pbs.error() reports an error afterwards, a warning is logged and
    self.status is set to the FAIL job status. The script file is removed and
    self.disconnect() is called in either case.
    """
    script_path = 'pylight_script'
    with open(script_path, 'w') as script_file:
        script_file.write(self.script)

    connection = self.connect()
    self.jobid = pbs.pbs_submit(connection, self.attropl, script_path, 'batch', "NULL")
    log.info("PBS submits a job %s" % self.jobid)
    os.remove(script_path)

    pbs_errno, pbs_text = pbs.error()
    if pbs_errno:
        log.warning("Failed to submit a job: %s", pbs_text)
        self.status = main.job.JOB_STATUS.FAIL

    self.disconnect()
def run_cluster(self, pbs_server, job_script, settings):
    """Submit job_script to a PBS server and poll the job's status.

    :param pbs_server: value passed as the connection argument to
        pbs.pbs_submit and pbs.pbs_statjob; when None, a connection to the
        default server is opened and used instead.
    :param job_script: path of the job script to submit
    :param settings: settings object; a deep copy is stored in self.settings

    The submitted job is named "inferno_" + self.name, is rerunnable and
    requests walltime "400" and nodes "1:ppn=4". The job ID is stored in
    self.job_id.

    NOTE(review): the polling loop at the end has no exit condition (see the
    TODO below), so this method does not return.
    """
    # imported here so the module can be used without pbs when no cluster is involved
    import pbs

    self.settings = copy.deepcopy(settings)

    # Launch script, wait for output to come back, return when it does
    # Create the job options struct
    attropl = pbs.new_attropl(4)

    # Set the name of the job
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "inferno_" + self.name

    # Job is Rerunable
    attropl[1].name = pbs.ATTR_r
    attropl[1].value = "y"

    # Walltime
    attropl[2].name = pbs.ATTR_l
    attropl[2].resource = "walltime"
    attropl[2].value = "400"

    # Nodes
    attropl[3].name = pbs.ATTR_l
    attropl[3].resource = "nodes"
    attropl[3].value = "1:ppn=4"

    # Run the job
    if pbs_server is None:
        # pbs_default() only yields the server *name*; pbs_submit and
        # pbs_statjob need a connection handle, so connect to that server
        pbs_server = pbs.pbs_connect(pbs.pbs_default())

    job_id = pbs.pbs_submit(pbs_server, attropl, job_script, "NULL", "NULL")

    e, e_txt = pbs.error()
    if e:
        print("%s %s" % (e, e_txt))

    # Save the job ID for later so we can check on the status
    self.job_id = job_id

    # TODO: Change this
    # Now loop, checking every 5 seconds or so if the job is done by
    # polling the pbs_server about the jobid.
    running = True
    while running:
        job_info = pbs.pbs_statjob(pbs_server, self.job_id, "NULL", "NULL")
        print(job_info)
        time.sleep(5)
def submitScript(script):
    """Submit a job script to the default PBS server.

    :param script: mapping with the keys 'jobName', 'maxTime', 'cpuNumber',
        'scriptName' and 'queue'. Falsy values for 'jobName', 'maxTime' and
        'cpuNumber' fall back to "new_job", '01:00:00' and a plain '1' for the
        nodes resource respectively.
    :return: dict with 'Result' ('OK' or 'ERROR') and 'Message' (the job ID
        on success, otherwise a description of the error). Exceptions are
        not propagated; they are reported through the returned dict.
    """
    result = {}
    try:
        pbs_connection = pbs.pbs_connect(pbs.pbs_default())
        try:
            attropl = pbs.new_attropl(4)

            # Set the name of the job
            attropl[0].name = pbs.ATTR_N
            attropl[0].value = str(script['jobName']) if script['jobName'] else "new_job"

            # Job is Rerunable
            attropl[1].name = pbs.ATTR_r
            attropl[1].value = 'y'

            # Walltime
            attropl[2].name = pbs.ATTR_l
            attropl[2].resource = 'walltime'
            attropl[2].value = str(script['maxTime']) if script['maxTime'] else '01:00:00'

            # Nodes: "1:ppn=<cpuNumber>", or just "1" when no cpu number is given
            # (parentheses only make the existing precedence explicit)
            attropl[3].name = pbs.ATTR_l
            attropl[3].resource = 'nodes'
            attropl[3].value = ('1:ppn=' + str(script['cpuNumber'])) if script['cpuNumber'] else '1'

            job_id = pbs.pbs_submit(pbs_connection, attropl, str(script['scriptName']),
                                    str(script['queue']), 'NULL')

            e, e_txt = pbs.error()
            if e:
                result['Result'] = 'ERROR'
                result['Message'] = str(e) + ' : ' + e_txt
            else:
                result['Result'] = 'OK'
                result['Message'] = job_id
        finally:
            # release the connection; a negative handle means no connection
            # was established, so there is nothing to close
            if pbs_connection >= 0:
                pbs.pbs_disconnect(pbs_connection)
    except Exception as exc:
        # deliberate catch-all: callers get errors through the result dict
        result['Result'] = 'ERROR'
        result['Message'] = str(exc)

    return result
def kill(self, obj):
    """Delete every valid job in obj.jobs from the PBS server.

    Jobs whose runningJob does not pass self.valid are skipped. On the first
    failing pbs_deljob call the PBS error is logged, the connection is closed
    and SchedulerError is raised; otherwise the connection is closed after
    all jobs have been handled.
    """
    connection = self.pbs_conn()

    for job in obj.jobs:
        if self.valid(job.runningJob):
            scheduler_id = str(job.runningJob['schedulerId']).strip()
            if pbs.pbs_deljob(connection, scheduler_id, '') != 0:
                code, message = pbs.error()
                self.logging.error('Error in job kill for ' + scheduler_id)
                failure = str(code) + ': ' + message
                self.logging.error('PBS error code ' + failure)
                # close the connection before giving up
                self.pbs_disconn(connection)
                raise SchedulerError('PBS error', failure)

    self.pbs_disconn(connection)
def kill(self, obj):
    """Ask the PBS server to delete all valid jobs of obj.

    Only jobs whose runningJob is accepted by self.valid are deleted. A
    non-zero return from pbs.pbs_deljob is logged together with the PBS error
    and raised as SchedulerError, after the connection has been closed. On
    success the connection is closed once all jobs were processed.
    """
    handle = self.pbs_conn()

    # lazily filtered, so validity is still checked one job at a time
    deletable = (entry for entry in obj.jobs if self.valid(entry.runningJob))
    for entry in deletable:
        sched_id = str(entry.runningJob['schedulerId']).strip()
        status = pbs.pbs_deljob(handle, sched_id, '')
        if status == 0:
            continue

        pbs_errno, pbs_text = pbs.error()
        self.logging.error('Error in job kill for ' + sched_id)
        self.logging.error('PBS error code ' + str(pbs_errno) + ': ' + pbs_text)
        self.pbs_disconn(handle)
        raise SchedulerError('PBS error', str(pbs_errno) + ': ' + pbs_text)

    self.pbs_disconn(handle)
def submit_jobs_pbs(self, jobs):
    """Submit every job in jobs to the 'batch' queue over the connection self.c.

    For each job a script is rendered from PBS_SCRIPT, written to a temporary
    file in the current directory, submitted and removed again. Jobs whose
    submission leaves a PBS error are logged and skipped; successfully
    submitted jobs are recorded in self.submit_list keyed by their PBS job ID.
    """
    # every job reuses the same script file name; it is removed after each submit
    script_path = 'pylight_script'

    for job in jobs:
        tool = job.tool
        command = job.create_command()
        attropl = self.get_pbs_attr(job.db_job.id, tool.config)

        script_text = PBS_SCRIPT % (job.tool.directory, command)
        log.info(script_text)
        with open(script_path, 'w') as script_file:
            script_file.write(script_text)

        pbs_job_id = pbs.pbs_submit(self.c, attropl, script_path, 'batch', "NULL")
        os.remove(script_path)

        pbs_errno, pbs_text = pbs.error()
        if not pbs_errno:
            log.info("PBS submits a job %s as %s" % (job, pbs_job_id))
            self.submit_list[pbs_job_id] = job
        else:
            # what about jobs that follow this one?
            log.warning("Failed to submit a job: %s", pbs_text)
def query(self, obj, service='', objType='node'):
    """
    Refresh the scheduler status of all valid jobs of a Task.

    For every job whose runningJob passes self.valid, the 'job_state' and
    'exec_host' attributes are requested from PBS and stored in
    runningJob['statusScheduler'], runningJob['status'] (mapped through
    self.status_map) and runningJob['destination']. A job for which PBS
    returns no status with error code 15001 is recorded as 'Done'; any other
    error is logged and raised as SchedulerError after closing the
    connection. SchedulerError is also raised when obj is not a Task.
    """
    if type(obj) != Task:
        raise SchedulerError('wrong argument type', str(type(obj)))

    # PBS error code for an unknown job (probably finished)
    unknown_job_errno = 15001

    connection = self.pbs_conn()
    wanted = pbs.new_attrl(2)
    wanted[0].name = 'job_state'
    wanted[1].name = 'exec_host'

    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue

        scheduler_id = str(job.runningJob['schedulerId']).strip()
        jobstat = pbs.pbs_statjob(connection, scheduler_id, wanted, 'Null')

        if not jobstat:
            code, message = pbs.error()
            if code != unknown_job_errno:
                self.logging.error('Error in job query for ' + scheduler_id)
                failure = str(code) + ': ' + message
                self.logging.error('PBS error code ' + failure)
                self.pbs_disconn(connection)
                raise SchedulerError('PBS error', failure)

        # no status entries means the job is gone, which is reported as 'Done'
        host = ''
        if len(jobstat) == 0:
            pbs_stat = 'Done'
        else:
            pbs_stat = jobstat[0].attribs[0].value
            if len(jobstat[0].attribs) > 1:
                host = jobstat[0].attribs[1].value

        job.runningJob['statusScheduler'] = pbs_stat
        job.runningJob['status'] = self.status_map[pbs_stat]
        job.runningJob['destination'] = host

    self.pbs_disconn(connection)
def _connect_to_server(server):
    """
    Open a connection to a pbs_server and return the connection handle.

    :param server: hostname of the pbs_server; the default server from
        pbs.pbs_default() is used when this is None (or otherwise falsy)
    :return: the connection handle returned by pbs.pbs_connect
    :raises Exception: if pbs.pbs_connect does not return a positive handle

    This function is shared between JobManager and TorqueJobRunner
    """
    target = server if server else pbs.pbs_default()
    handle = pbs.pbs_connect(target)

    if handle > 0:
        return handle

    # the batch system returned an error, throw exception
    errno, _unused_text = pbs.error()
    raise Exception("Error connecting to pbs_server. Torque error {0}: '{1}'".format(
        errno, torque_strerror(errno)))
def main():
    """Add a value to the 'properties' attribute of node "e2".

    The change is sent to the default PBS server with the pbs.INCR operator.
    The script exits with status 1 when no default server is configured or
    when the connection cannot be established. If pbs_manager reports a
    failure, its return code and the PBS error number/text are printed.
    """
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print('No default server')
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        # a negative handle means the connection failed; report why and stop
        # instead of issuing requests on an invalid handle
        errno, text = pbs.error()
        print("%s %s" % (errno, text))
        sys.exit(1)

    try:
        attrop_l = pbs.new_attropl(1)
        attrop_l[0].name = 'properties'
        attrop_l[0].value = 'set_something_useful'
        attrop_l[0].op = pbs.INCR

        r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, "e2", attrop_l, 'NULL')
        if r > 0:
            print("%s ;" % r)
            errno, text = pbs.error()
            print("%s %s" % (errno, text))
    finally:
        # always give the connection back to the server
        pbs.pbs_disconnect(con)
def set_hold(self, hold_type=None):
    """Set a hold on the job, unless a hold of that type is already tracked.

    :param hold_type: PBS hold type; defaults to pbs.USER_HOLD when None.

    The hold is requested through pbs.pbs_holdjob. On success the hold type
    is added to self.holds; a non-zero exit code or a PBS error is reported
    via self.log.error. A hold type that is already in self.holds only
    triggers a warning.
    """
    # resolving the default at call time (not in the signature) keeps this
    # module loadable when the pbs module is not available
    hold_type = pbs.USER_HOLD if hold_type is None else hold_type

    already_held = hold_type in self.holds
    if already_held:
        self.log.warning("Hold type %s was already set for %s" % (hold_type, self.jobid))
    else:
        if hold_type not in KNOWN_HOLD_TYPES:
            unknown_msg = "set_hold: unknown hold type: %s (supported: %s)" % (hold_type, KNOWN_HOLD_TYPES)
            self.log.error(unknown_msg)

        # set hold, then look at both the return code and the PBS error state
        rc = pbs.pbs_holdjob(self.pbsconn, self.jobid, hold_type, NULL)
        err_flag, err_text = pbs.error()
        if err_flag or rc:
            failure_msg = "Failed to set hold of type %s on job %s (is_error: %s, exit code: %s, msg: %s)" % (
                hold_type, self.jobid, err_flag, rc, err_text)
            self.log.error(failure_msg)
        else:
            # keep track of this hold
            self.holds.append(hold_type)
def _process(self, batch_list):
    '''Apply the changes in batch_list to the nodes of the default batch server.

    Every item of batch_list is used as (node name, attribute list): item[0]
    and item[1] are passed to pbs.pbs_manager.

    With ARGS_DRYRUN set, nothing is sent to the server; the pbs_manager call
    is printed for each node instead. Otherwise one connection is opened for
    the whole batch and always closed afterwards. Errors for individual nodes
    are written to stderr without aborting the batch. Exits with status 1
    when no default server can be found.
    '''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process input:%s' % str(batch_list), file=sys.stderr)

    ## Always get the pbs_server name, even in dry-run mode
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        _print('Could not locate a pbs server', file=sys.stderr)
        sys.exit(1)

    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server, file=sys.stderr)

    ## If dry-run is not specified create a single connection for the whole
    ## batch (a new connection per node would never be closed)
    dry_run = ARGS_DRYRUN
    pbs_connection = None if dry_run else pbs.pbs_connect(pbs_server)

    try:
        ## Execute the changes
        for node in batch_list:
            if dry_run:
                _print(
                    "pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')"
                    % (node[0], str(node[1])))
                continue

            rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                                    node[0], node[1], 'NULL')
            if rcode > 0:
                errno, text = pbs.error()
                _print('PBS error for node \'%s\': %s (%s)' % (node[0], text, errno), file=sys.stderr)
    finally:
        ## Close the connection with the batch system
        if pbs_connection is not None:
            pbs.pbs_disconnect(pbs_connection)
def release_hold(self, hold_type=None):
    """Release hold on job of specified type.

    :param hold_type: PBS hold type to release; pbs.USER_HOLD is used when None.

    The release is only attempted when the hold type is present in
    self.holds; otherwise a warning is logged. A hold type outside
    KNOWN_HOLD_TYPES and a failing pbs.pbs_rlsjob call are reported through
    self.log.error. On success the hold type is removed from self.holds.
    """
    # we can't set this default for hold_type in function signature,
    # because we need to be able to load this module even when the pbs module is not available
    if hold_type is None:
        hold_type = pbs.USER_HOLD

    # only release hold if it was set
    if hold_type in self.holds:
        if hold_type not in KNOWN_HOLD_TYPES:
            self.log.error("release_hold: unknown hold type: %s (supported: %s)" % (hold_type, KNOWN_HOLD_TYPES))

        # release hold, check for errors, remove from list of holds
        ec = pbs.pbs_rlsjob(self.pbsconn, self.jobid, hold_type, NULL)
        is_error, errormsg = pbs.error()
        if is_error or ec:
            tup = (hold_type, self.jobid, is_error, ec, errormsg)
            self.log.error("Failed to release hold type %s on job %s (is_error: %s, exit code: %s, msg: %s)" % tup)
        else:
            self.holds.remove(hold_type)
            # only report success once the release is known to have worked
            self.log.debug("Released hold of type %s for job %s" % (hold_type, self.jobid))
    else:
        self.log.warning("No hold type %s was set for %s, so skipping hold release" % (hold_type, self.jobid))
def query(self, obj, service='', objType='node'):
    """
    Query PBS for the state of each valid job in the given Task.

    The 'job_state' and 'exec_host' attributes are requested per job. The
    results are written to the job's runningJob mapping under
    'statusScheduler', 'status' (looked up in self.status_map) and
    'destination'. When PBS returns nothing and reports error 15001 the job
    is marked 'Done'; other PBS errors are logged and raised as
    SchedulerError once the connection has been closed. A non-Task argument
    raises SchedulerError as well.
    """
    if type(obj) != Task:
        raise SchedulerError('wrong argument type', str(type(obj)))

    handle = self.pbs_conn()

    requested = pbs.new_attrl(2)
    for position, attr_name in enumerate(('job_state', 'exec_host')):
        requested[position].name = attr_name

    for job in obj.jobs:
        if self.valid(job.runningJob):
            sched_id = str(job.runningJob['schedulerId']).strip()
            jobstat = pbs.pbs_statjob(handle, sched_id, requested, 'Null')

            if not jobstat:
                pbs_errno, pbs_text = pbs.error()
                # 15001: unknown job (probably finished), which is not a failure
                if pbs_errno != 15001:
                    self.logging.error('Error in job query for ' + sched_id)
                    self.logging.error('PBS error code ' + str(pbs_errno) + ': ' + pbs_text)
                    self.pbs_disconn(handle)
                    raise SchedulerError('PBS error', str(pbs_errno) + ': ' + pbs_text)

            if len(jobstat) == 0:
                state, exec_host = 'Done', ''
            else:
                state = jobstat[0].attribs[0].value
                # the execution host is only reported as a second attribute
                if len(jobstat[0].attribs) > 1:
                    exec_host = jobstat[0].attribs[1].value
                else:
                    exec_host = ''

            job.runningJob['statusScheduler'] = state
            job.runningJob['status'] = self.status_map[state]
            job.runningJob['destination'] = exec_host

    self.pbs_disconn(handle)
def __init__(self, settings):
    """Set up the job bookkeeping and, in cluster mode, connect to PBS.

    :param settings: nested mapping; settings["global"] provides
        "use_cluster" and either "cluster_address" (cluster mode) or
        "n_processors" (local mode).

    In cluster mode the pbs module is imported, max_threads is set to
    sys.maxint and a connection to the configured address is stored in
    self.connection; the process exits with status 1 if that connection
    fails. In local mode max_threads is taken from "n_processors".
    """
    # spawn own thread, start up queue, start connection to server
    for container in ("queue", "finished", "running", "error", "job_ids", "threads"):
        setattr(self, container, [])
    self.use_cluster = False
    self.connection = None
    self.curr_id = 0
    self.settings = settings

    if settings["global"]["use_cluster"] == True:
        # pbs is only needed (and only imported) when a cluster is used
        import pbs
        self.use_cluster = True

        # Establish connection to PBS server
        serv_addr = settings["global"]["cluster_address"]
        # Let the cluster's jobman handle scheduling
        self.max_threads = sys.maxint
        self.connection = pbs.pbs_connect(serv_addr)
        if self.connection < 0:
            errno, text = pbs.error()
            print("Error, unable to establish connection to PBS server.")
            print("%s %s" % (errno, text))
            sys.exit(1)
    else:
        self.use_cluster = False
        self.max_threads = settings["global"]["n_processors"]
def queue_job( self, job_wrapper ):
    """Create PBS script for a job and submit it to the PBS queue

    The PBS server and queue are taken from the 'destination' (or '-q')
    parameter of the job destination, falling back to
    self.default_pbs_server and the server's default queue. The job script
    is written to the cluster files directory and submitted with up to five
    attempts. On success the job is handed to self.monitor_queue for
    monitoring; on failure job_wrapper.fail() is called. The connection to
    the PBS server is closed on every path that leaves this method after it
    was opened.
    """
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail( "Unable to queue job for execution. Resubmitting the job may succeed." )
        log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [ str( o ) for o in output_fnames ]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
        stageout = self.get_stage_in_out( output_files )
        attrs = [
            dict( name=pbs.ATTR_o, value=pbs_ofile ),
            dict( name=pbs.ATTR_e, value=pbs_efile ),
            dict( name=pbs.ATTR_stagein, value=stagein ),
            dict( name=pbs.ATTR_stageout, value=stageout ),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict( name=pbs.ATTR_o, value=ofile ),
            dict( name=pbs.ATTR_e, value=efile ),
        ]

    # define PBS job options
    attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
    job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
    for i, attr in enumerate( attrs + pbs_options ):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']
    exec_dir = os.path.abspath( job_wrapper.working_directory )

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with open(ecfile, 'a'):
            os.utime(ecfile, None)

        # NOTE(review): output_files is only assigned when
        # pbs_application_server is set; this branch presumably relies on
        # both options being configured together - confirm
        stage_commands = pbs_symlink_template % (
            " ".join( job_wrapper.get_input_fnames() + output_files ),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [ stage_commands ]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script( job_file, script )

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
        pbs.pbs_disconnect(c)
        if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
            self.cleanup( ( ofile, efile, ecfile, job_file ) )
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
        time.sleep(2)
    else:
        # every attempt failed: the connection is still open at this point,
        # so close it before bailing out
        pbs.pbs_disconnect(c)
        log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
        job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

    # persist destination
    job_wrapper.set_job_destination( job_destination, job_id )

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( job_state )
def pbs_batch( self, nodes, attrs=None, note_attributes=None ): nodeserror = list() if not attrs and not note_attributes: raise sara_nodesException, 'attrs and note_attributes can not be empty together!' if not self.dryrun: if note_attributes and len( note_attributes ) == 3: if attrs: attributes = attrs + pbs.new_attropl(1) attributes[1].name = pbs.ATTR_NODE_note attributes[1].op = pbs.SET else: attributes = pbs.new_attropl(1) attributes[0].name = pbs.ATTR_NODE_note attributes[0].op = pbs.SET else: attributes = attrs # Some hacking here because some limitation in the Torque 2.4 version # fetching note data first for all nodes! tmp_node_note = dict() for node in nodes: if note_attributes and len( note_attributes ) == 3: tmp_node_note[ node ] = self.note( node, note_attributes ) pbs_server = pbs.pbs_default() if not pbs_server: raise sara_nodesException, 'Default pbs server not found!' pbs_connection = pbs.pbs_connect( pbs_server ) for node in nodes: if note_attributes and len( note_attributes ) == 3: try: if attrs: attributes[1].value = tmp_node_note[ node ] else: attributes[0].value = tmp_node_note[ node ] except KeyError: pass rcode = pbs.pbs_manager( pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node, attributes, 'NULL' ) if rcode > 0: errno, text = pbs.error() nodeserror.append( '%s: %s (%s)' % ( node, text, errno ) ) else: p = PBSQuery.PBSQuery() pbsnodes = p.getnodes().keys() print '%*s:' % ( 7, 'Nodes' ), firstitem = True for node in nodes: if node in pbsnodes: if firstitem: print '%s' % node firstitem = False else: print '%*s' % ( 17, node ) else: nodeserror.append( '%s: does not exist' % node ) if len( nodeserror ) > 0: raise sara_nodesException, nodeserror
def queue_job(self, batch_job):
    """
    Queue a BatchJob and return its job ID.

    When self.submit is set, the job script is submitted to the batch server
    (retrying up to _MAX_RETRY times when no job ID comes back); otherwise a
    fake "<seq>.civet" ID is generated. In both cases the job name is
    recorded and a line with the ID, the name and the dependencies is
    appended to the ID log.

    :param batch_job: description of the job to queue
    :return: the job ID (real or fake)
    :raises Exception: if the job could not be submitted
    """
    # batch job names should be unique for civet pipelines because the job
    # name is used to name log files and other output; Civet generates unique
    # names for each step, so this only guards against a programming error
    assert batch_job.name not in self._job_names

    log_dir = self.execution_log_dir if self.execution_log_dir else self.log_dir

    # default the stdout/stderr locations when the job did not set them
    if not batch_job.stdout_path:
        batch_job.stdout_path = os.path.join(log_dir, batch_job.name + ".o")
    if not batch_job.stderr_path:
        batch_job.stderr_path = os.path.join(log_dir, batch_job.name + ".e")

    # write batch script
    filename = self.write_script(batch_job)

    if not self.submit:
        # self.submit is False, fake a job ID
        job_id = "{0}.civet".format(self._id_seq)
        self._id_seq += 1
    else:
        def _output_target(path):
            #XXX workaround for a TORQUE bug where local copies of stderr &
            # stdout files to /dev/null don't work correctly but remote
            # copies (to submit host) do
            if path == "/dev/null":
                return socket.gethostname() + ":/dev/null"
            return path

        # build up our torque job attributes and resources
        job_attributes = {}
        job_resources = {
            'nodes': "{0}:ppn={1}".format(batch_job.nodes, batch_job.ppn),
            'walltime': batch_job.walltime,
            'epilogue': self.epilogue_filename
        }
        if batch_job.mem:
            job_resources['mem'] = batch_job.mem

        job_attributes[pbs.ATTR_v] = self.generate_env(batch_job.workdir)
        if batch_job.name:
            job_attributes[pbs.ATTR_N] = batch_job.name
        job_attributes[pbs.ATTR_o] = _output_target(batch_job.stdout_path)
        job_attributes[pbs.ATTR_e] = _output_target(batch_job.stderr_path)

        # a user hold is only requested for jobs without dependencies
        if batch_job.depends_on:
            job_attributes[pbs.ATTR_depend] = self._dependency_string(batch_job)
        elif self.submit_with_hold:
            job_attributes[pbs.ATTR_h] = 'u'

        if batch_job.mail_option:
            job_attributes[pbs.ATTR_m] = batch_job.mail_option
        if batch_job.email_list:
            job_attributes[pbs.ATTR_M] = batch_job.email_list
        if batch_job.date_time:
            start_epoch = int(time.mktime(batch_job.date_time.timetuple()))
            job_attributes[pbs.ATTR_a] = str(start_epoch)

        # populate pbs_attrs: resources first, then the plain attributes
        resource_count = len(job_resources)
        pbs_attrs = pbs.new_attropl(len(job_attributes) + resource_count)
        for slot, (resource, val) in enumerate(job_resources.iteritems()):
            pbs_attrs[slot].name = pbs.ATTR_l
            pbs_attrs[slot].resource = resource
            pbs_attrs[slot].value = val
        for slot, (attribute, val) in enumerate(job_attributes.iteritems(), resource_count):
            pbs_attrs[slot].name = attribute
            pbs_attrs[slot].value = val

        # all attributes are in place, so connect to the server and submit
        connection = _connect_to_server(self._server)

        def _attempt_submit():
            return pbs.pbs_submit(connection, pbs_attrs, filename, self.queue, None)

        job_id = _attempt_submit()
        attempts = 0
        # if pbs.pbs_submit failed, try again
        while not job_id and attempts < _MAX_RETRY:
            attempts += 1
            print("Retrying connection...", file=sys.stderr)
            time.sleep(attempts ** 2)
            job_id = _attempt_submit()

        pbs.pbs_disconnect(connection)

        # check to see if the job was submitted successfully.
        if not job_id:
            # the batch system returned an error, throw exception
            errno, _unused_text = pbs.error()
            raise Exception("Error submitting job. Torque error {0}: '{1}'".format(
                errno, torque_strerror(errno)))

        if self.submit_with_hold and not batch_job.depends_on:
            self.held_jobs.append(job_id)

    self._job_names.append(batch_job.name)

    log_fields = [job_id, batch_job.name,
                  str(self._printable_dependencies(batch_job.depends_on))]
    self._id_log.write('\t'.join(log_fields) + '\n')
    self._id_log.flush()
    return job_id
def submit(self, with_hold=False):
    """Submit the job script in self.script and store the job ID in self.jobid.

    :param with_hold: when True the job is submitted with a user hold, which
        is also recorded in self.holds.

    The job attributes cover the job name, the requested resources, the
    'afterany' dependencies on self.deps, the optional hold, the PBS_O_*
    and user-specified environment variables, and disabled mail. The script
    is written to a temporary file that is removed again after a successful
    submission; a failed submission is reported through self.log.error.
    """
    def _single_attribute(attr_name, attr_value):
        # build a one-element attribute list holding the given name/value
        entry = pbs.new_attropl(1)
        entry[0].name = attr_name
        entry[0].value = attr_value
        return entry

    txt = self.script
    self.log.debug("Going to submit script %s" % txt)

    # Build default pbs_attributes list, starting with the Job_Name
    pbs_attributes = _single_attribute(pbs.ATTR_N, self.name)

    # set resource requirements (Resource_List entries)
    resource_attributes = pbs.new_attropl(len(self.resources))
    for position, (res_name, res_value) in enumerate(self.resources.items()):
        resource_attributes[position].name = pbs.ATTR_l
        resource_attributes[position].resource = res_name
        resource_attributes[position].value = res_value
    pbs_attributes.extend(resource_attributes)

    # add job dependencies to attributes
    if self.deps:
        dependency_spec = ",".join(["afterany:%s" % dep for dep in self.deps])
        deps_attributes = _single_attribute(pbs.ATTR_depend, dependency_spec)
        pbs_attributes.extend(deps_attributes)
        self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

    # submit job with (user) hold if requested
    if with_hold:
        hold_attributes = _single_attribute(pbs.ATTR_h, pbs.USER_HOLD)
        pbs_attributes.extend(hold_attributes)
        self.holds.append(pbs.USER_HOLD)
        self.log.debug("Job hold attributes: %s" % hold_attributes[0].value)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())
    pbsvars = []
    for var in ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']:
        pbsvars.append("PBS_O_%s=%s" % (var, os.environ.get(var, 'NOTFOUND_%s' % var)))
    # extend PBS variables with specified variables
    for (name, value) in self.env_vars.items():
        pbsvars.append("%s=%s" % (name, value))
    variable_attributes = _single_attribute(pbs.ATTR_v, ",".join(pbsvars))  # Variable_List
    pbs_attributes.extend(variable_attributes)
    self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

    # mail settings: Mail_Points 'n' disables all mail
    mail_attributes = _single_attribute(pbs.ATTR_m, 'n')
    pbs_attributes.extend(mail_attributes)
    self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

    # write the job script to a temporary file
    fh, scriptfn = tempfile.mkstemp()
    with os.fdopen(fh, 'w') as script_file:
        self.log.debug("Writing temporary job script to %s" % scriptfn)
        script_file.write(txt)

    self.log.debug("Going to submit to queue %s" % self.queue)

    # job submission sometimes fails without producing an error, e.g. when one
    # of the dependency jobs has already finished; when that occurs, None will
    # be returned by pbs_submit as job id
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, NULL)
    is_error, errormsg = pbs.error()
    submitted = not (is_error or jobid is None)
    if submitted:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid
        os.remove(scriptfn)
    else:
        self.log.error("Failed to submit job script %s (job id: %s, error %s)" % (scriptfn, jobid, errormsg))
def _submit(self):
    """
    Submit the job script (self.script) with a user hold and store the job id in self.jobid.

    Job output and error files are placed in the 'job_output_dir' build option directory.
    The script is written to a temporary file that is removed after a successful submission.

    Raises EasyBuildError if PBS reports an error or returns no job id.
    """
    script_txt = self.script
    self.log.debug("Going to submit script %s" % script_txt)

    # fixed attributes: job name, stdout path, stderr path
    out_dir = build_option('job_output_dir')
    pbs_attributes = pbs.new_attropl(3)
    pbs_attributes[0].name = pbs.ATTR_N  # Job_Name
    pbs_attributes[0].value = self.name
    pbs_attributes[1].name = pbs.ATTR_o
    pbs_attributes[1].value = os.path.join(out_dir, '%s.o$PBS_JOBID' % self.name)
    pbs_attributes[2].name = pbs.ATTR_e
    pbs_attributes[2].value = os.path.join(out_dir, '%s.e$PBS_JOBID' % self.name)

    # one Resource_List attribute per requested resource
    resource_attrs = pbs.new_attropl(len(self.resources))
    for pos, (res_name, res_value) in enumerate(self.resources.items()):
        resource_attrs[pos].name = pbs.ATTR_l  # Resource_List
        resource_attrs[pos].resource = res_name
        resource_attrs[pos].value = res_value
    pbs_attributes.extend(resource_attrs)

    # a single 'afterany' dependency attribute built from the job ids of self.deps
    if self.deps:
        dep_specs = ["afterany:%s" % dep.jobid for dep in self.deps]
        deps_attrs = pbs.new_attropl(1)
        deps_attrs[0].name = pbs.ATTR_depend
        deps_attrs[0].value = ",".join(dep_specs)
        pbs_attributes.extend(deps_attrs)
        self.log.debug("Job deps attributes: %s" % deps_attrs[0].value)

    # the job is always submitted with a (user) hold, which is tracked in self.holds
    hold_attrs = pbs.new_attropl(1)
    hold_attrs[0].name = pbs.ATTR_h
    hold_attrs[0].value = pbs.USER_HOLD
    pbs_attributes.extend(hold_attrs)
    self.holds.append(pbs.USER_HOLD)
    self.log.debug("Job hold attributes: %s" % hold_attrs[0].value)

    # pass along the variables qsub would normally add (PBS_O_*);
    # PBS_O_WORKDIR falls back to the current working directory
    os.environ.setdefault('WORKDIR', os.getcwd())
    exported = []
    for var in ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']:
        exported.append("PBS_O_%s=%s" % (var, os.environ.get(var, 'NOTFOUND_%s' % var)))
    # followed by the variables specified for this job
    for (name, value) in self.env_vars.items():
        exported.append("%s=%s" % (name, value))

    var_attrs = pbs.new_attropl(1)
    var_attrs[0].name = pbs.ATTR_v  # Variable_List
    var_attrs[0].value = ",".join(exported)
    pbs_attributes.extend(var_attrs)
    self.log.debug("Job variable attributes: %s" % var_attrs[0].value)

    # mail settings
    mail_attrs = pbs.new_attropl(1)
    mail_attrs[0].name = pbs.ATTR_m  # Mail_Points
    mail_attrs[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attrs)
    self.log.debug("Job mail attributes: %s" % mail_attrs[0].value)

    # pbs_submit takes a path, so the script has to be written to disk first
    handle, scriptfn = tempfile.mkstemp()
    script_file = os.fdopen(handle, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    script_file.write(script_txt)
    script_file.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # job submission sometimes fails without producing an error, e.g. when one of
    # the dependency jobs has already finished; pbs_submit then returns None as job id
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, NULL)
    is_error, errormsg = pbs.error()
    if is_error or jobid is None:
        raise EasyBuildError(
            "Failed to submit job script %s (job id: %s, error %s)", scriptfn, jobid, errormsg)

    self.log.debug("Succesful job submission returned jobid %s" % jobid)
    self.jobid = jobid
    os.remove(scriptfn)
def queue_job(self, job_wrapper):
    """
    Create the PBS script for a job and submit it to the PBS queue.

    The PBS server and queue are taken from the job destination parameters
    ('destination' or '-q', optionally in 'queue@server' form), falling back to
    self.default_pbs_server. Submission is attempted up to five times; on success
    an AsynchronousJobState describing the job is put on self.monitor_queue.
    On failure the job wrapper is failed and nothing is queued for monitoring.

    :param job_wrapper: wrapper of the job to prepare and submit
    """
    # prepare the job
    if not self.prepare_job(job_wrapper, include_metadata=not (self.app.config.pbs_stage_path)):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail(
            "Unable to queue job for execution. Resubmitting the job may succeed."
        )
        log.error("Connection to PBS server for submit failed: %s: %s" % (errno, text))
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [str(o) for o in output_fnames]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() + output_files, symlink=True)
        stageout = self.get_stage_in_out(output_files)
        attrs = [
            dict(name=pbs.ATTR_o, value=pbs_ofile),
            dict(name=pbs.ATTR_e, value=pbs_efile),
            dict(name=pbs.ATTR_stagein, value=stagein),
            dict(name=pbs.ATTR_stageout, value=stageout),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict(name=pbs.ATTR_o, value=ofile),
            dict(name=pbs.ATTR_e, value=efile),
        ]

    # define PBS job options
    attrs.append(
        dict(name=pbs.ATTR_N,
             value=str("%s_%s_%s" % (job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user))))
    job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
    for i, attr in enumerate(attrs + pbs_options):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']
    exec_dir = os.path.abspath(job_wrapper.working_directory)

    # write the job script
    # NOTE(review): output_files is only assigned in the application-server branch
    # above, so this path presumably requires pbs_application_server to be set
    # whenever pbs_stage_path is non-empty -- confirm against the configuration rules.
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        # (open() instead of the Python-2-only file() builtin)
        with open(ecfile, 'a'):
            os.utime(ecfile, None)

        stage_commands = pbs_symlink_template % (
            " ".join(job_wrapper.get_input_fnames() + output_files),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [stage_commands]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script(job_file, script)

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug("Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id)
        pbs.pbs_disconnect(c)
        if self.app.config.cleanup_job in ("always", "onsuccess"):
            self.cleanup((ofile, efile, ecfile, job_file))
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text))
        time.sleep(2)
    else:
        # every attempt failed: release the server connection before giving up,
        # otherwise each failed job leaks one connection
        pbs.pbs_disconnect(c)
        log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
        job_wrapper.fail(
            "Unable to run this job due to a cluster error, please retry it later"
        )
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id))
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id))

    # persist destination
    job_wrapper.set_job_destination(job_destination, job_id)

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(job_state)
def main():
    """
    Query the state of every node of a PBS server and display the cluster status.

    The server name is taken from the first command line argument, falling back
    to the default PBS server. For each node the first listed state is translated
    via translate_state; a free node that reports a 'jobs' attribute is shown with
    the pbs_ND_free_and_job state instead. Nodes are labelled with the first run
    of digits in their name, or with a running counter when the name has none.
    Exits with status 1 when no server is known or the connection fails.
    """
    state_list = []
    node_list = []
    node_nr = 0

    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            print("No default pbs server, usage: pbsmon [server] ")
            sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print("%s %s" % (errno, text))
        sys.exit(1)

    # We are only interested in the state and jobs of a node
    attrl = pbs.new_attrl(2)
    attrl[0].name = "state"
    attrl[1].name = "jobs"

    # the extend argument is passed as the string "NULL", not None
    nodes = pbs.pbs_statnode(con, "", attrl, "NULL")

    # compiled once here rather than once per node
    re_host = re.compile(
        r"""
        (?P<name>\d+)
        """,
        re.VERBOSE,
    )

    for node in nodes:
        node_attr = node.attribs

        # A node can have several states; we are only interested in the first entry.
        state = node_attr[0].value.split(",")[0]

        # a free node that already has a job scheduled on it is shown as a separate state
        has_jobs = any(attr.name == "jobs" for attr in node_attr)
        if state == pbs.ND_free and has_jobs:
            state_list.append(translate_state[pbs_ND_free_and_job])
        else:
            state_list.append(translate_state[state])

        result = re_host.search(node.name)
        if result:
            node_list.append(result.group("name"))
        else:
            node_nr = node_nr + 1
            node_list.append(str(node_nr))

    display_cluster_status(node_list, state_list)
def submitJob(self, conn, job, task=None, requirements=''):
    """
    Submit one job to PBS through a temporary wrapper script.

    The wrapper copies the files of the task's input sandbox into
    $PBS_JOBCOOKIE on the worker node, runs the job executable there and
    removes the directory afterwards. A second temporary script is registered
    as the job's 'epilogue' resource so the directory is also removed when the
    job is aborted.

    :param conn: PBS connection handle passed to pbs_submit
    :param job: mapping with 'executable', 'arguments', 'standardOutput',
                'standardError' and 'name' entries
    :param task: mapping with a comma-separated 'globalSandbox' entry; required
                 despite the None default (it is indexed unconditionally)
    :param requirements: accepted but not used by this method
    :return: tuple ({job name: PBS job id}, None, None)
    :raises SchedulerError: if PBS returns no job id
    """
    # NB: we assume an env var PBS_JOBCOOKIE points to the exec dir on the batch host
    ifiles = task['globalSandbox'].split(',')

    # Write a temporary submit script
    f = tempfile.NamedTemporaryFile()
    s = ['#!/bin/sh']
    if self.workerNodeWorkDir:
        s.append('cd ' + self.workerNodeWorkDir)
    s.append('if [ ! -d $PBS_JOBCOOKIE ] ; then mkdir -p $PBS_JOBCOOKIE ; fi')
    s.append('cd $PBS_JOBCOOKIE')
    for ifile in ifiles:
        s.append('cp ' + ifile + ' .')
    s.append(self.jobScriptDir + job['executable'] + ' ' + job['arguments'] +
             ' >' + job['standardOutput'] + ' 2>' + job['standardError'])
    if self.workerNodeWorkDir:
        s.append('cd ' + self.workerNodeWorkDir)
    # this fails if the job is aborted, which leaks disc space. Adding an epilogue to make
    # sure it's gone for good - AMM 18/07/2011
    s.append('rm -fr $PBS_JOBCOOKIE')
    f.write('\n'.join(s))
    f.flush()

    # epilogue script: cleans up the job directory and leaves a done.<arg> marker in $HOME
    epilogue = tempfile.NamedTemporaryFile()
    s = ['#!/bin/sh']
    if self.workerNodeWorkDir:
        s.append('cd ' + self.workerNodeWorkDir)
    s.append('rm -fr $PBS_JOBCOOKIE')
    s.append('touch $HOME/done.$1')
    epilogue.write('\n'.join(s))
    epilogue.flush()
    # rwx for the owner only; the mode must be octal (decimal 700 is octal 1274,
    # which drops the owner's read bit and sets unrelated ones)
    os.chmod(epilogue.name, 0o700)

    attr_dict = {'Job_Name': 'CRAB_PBS',
                 'Variable_List': self.pbs_env,
                 'Output_Path': self.jobResDir + 'wrapper_' + str(job['standardOutput']),
                 'Error_Path': self.jobResDir + 'wrapper_' + str(job['standardError'])
                 }

    # plain attributes, then one Resource_List entry per resource, then the epilogue
    attropl = pbs.new_attropl(len(attr_dict) + len(self.res_dict) + 1)
    i_attr = 0
    for k in attr_dict.keys():
        self.logging.debug("adding k %s" % k)
        attropl[i_attr].name = k
        attropl[i_attr].value = attr_dict[k]
        i_attr += 1
    for k in self.res_dict.keys():
        attropl[i_attr].name = 'Resource_List'
        attropl[i_attr].resource = k
        attropl[i_attr].value = self.res_dict[k]
        i_attr += 1

    attropl[i_attr].name = 'Resource_List'
    attropl[i_attr].resource = 'epilogue'
    attropl[i_attr].value = epilogue.name
    self.logging.debug("adding epilogue: %s" % epilogue.name)
    i_attr += 1

    jobid = pbs.pbs_submit(conn, attropl, f.name, self.queue, 'NULL')
    f.close()

    if not jobid:
        err, err_text = pbs.error()
        self.logging.error('Error in job submission')
        self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
        self.pbs_disconn(conn)
        raise SchedulerError('PBS error', str(err) + ': ' + err_text)

    return {job['name']: jobid}, None, None
def submit(self):
    """
    Submit the job script (self.script) to PBS and store the returned job id in self.jobid.

    The script is written to a temporary file that is removed again after a
    successful submission; a failed submission is reported via self.log.error.
    """
    script_txt = self.script
    self.log.debug("Going to submit script %s" % script_txt)

    # the attribute list starts out with just the job name
    pbs_attributes = pbs.new_attropl(1)
    pbs_attributes[0].name = 'Job_Name'
    pbs_attributes[0].value = self.name

    # one Resource_List attribute per requested resource
    resource_attrs = pbs.new_attropl(len(self.resources))
    for pos, (res_name, res_value) in enumerate(self.resources.items()):
        resource_attrs[pos].name = 'Resource_List'
        resource_attrs[pos].resource = res_name
        resource_attrs[pos].value = res_value
    pbs_attributes.extend(resource_attrs)

    # a single 'afterany' dependency attribute covering every job in self.deps
    if self.deps:
        dep_specs = ["afterany:%s" % dep for dep in self.deps]
        deps_attrs = pbs.new_attropl(1)
        deps_attrs[0].name = pbs.ATTR_depend
        deps_attrs[0].value = ",".join(dep_specs)
        pbs_attributes.extend(deps_attrs)
        self.log.debug("Job deps attributes: %s" % deps_attrs[0].value)

    # pass along the variables qsub would normally add (PBS_O_*);
    # PBS_O_WORKDIR falls back to the current working directory
    os.environ.setdefault('WORKDIR', os.getcwd())
    exported = []
    for var in ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']:
        exported.append("PBS_O_%s=%s" % (var, os.environ.get(var, 'NOTFOUND_%s' % var)))
    # followed by the variables specified for this job
    for (name, value) in self.env_vars.items():
        exported.append("%s=%s" % (name, value))

    var_attrs = pbs.new_attropl(1)
    var_attrs[0].name = 'Variable_List'
    var_attrs[0].value = ",".join(exported)
    pbs_attributes.extend(var_attrs)
    self.log.debug("Job variable attributes: %s" % var_attrs[0].value)

    # mail settings
    mail_attrs = pbs.new_attropl(1)
    mail_attrs[0].name = 'Mail_Points'
    mail_attrs[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attrs)
    self.log.debug("Job mail attributes: %s" % mail_attrs[0].value)

    # pbs_submit takes a path, so the script has to be written to disk first
    import tempfile
    handle, scriptfn = tempfile.mkstemp()
    script_file = os.fdopen(handle, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    script_file.write(script_txt)
    script_file.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # extend parameter should be 'NULL' because this is required by the python api
    extend = 'NULL'
    # job submission sometimes fails without producing an error, e.g. when one of
    # the dependency jobs has already finished; pbs_submit then returns None as job id
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, extend)
    is_error, errormsg = pbs.error()
    submitted = not is_error and jobid is not None
    if submitted:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid
        # only cleaned up on success: on failure the script path is reported in the error
        os.remove(scriptfn)
    else:
        self.log.error("Failed to submit job script %s (job id: %s, error %s)" % (scriptfn, jobid, errormsg))
def submit(self):
    """
    Submit the job script (self.script) to PBS and store the returned job id in self.jobid.

    The script is written to a temporary file that is removed again after a
    successful submission. A submission counts as failed when PBS reports an
    error or when pbs_submit returns None as job id; failures are reported via
    self.log.error and leave self.jobid untouched.
    """
    txt = self.script
    self.log.debug("Going to submit script %s" % txt)

    # Build default pbs_attributes list
    pbs_attributes = pbs.new_attropl(1)
    pbs_attributes[0].name = 'Job_Name'
    pbs_attributes[0].value = self.name

    # set resource requirements: one Resource_List attribute per resource
    resource_attributes = pbs.new_attropl(len(self.resources))
    for idx, (k, v) in enumerate(self.resources.items()):
        resource_attributes[idx].name = 'Resource_List'
        resource_attributes[idx].resource = k
        resource_attributes[idx].value = v
    pbs_attributes.extend(resource_attributes)

    # add job dependencies to attributes
    if self.deps:
        deps_attributes = pbs.new_attropl(1)
        deps_attributes[0].name = pbs.ATTR_depend
        deps_attributes[0].value = ",".join(["afterany:%s" % dep for dep in self.deps])
        pbs_attributes.extend(deps_attributes)
        self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())

    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']
    pbsvars = ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars]
    # extend PBS variables with specified variables
    pbsvars.extend(["%s=%s" % (name, value) for (name, value) in self.env_vars.items()])
    variable_attributes = pbs.new_attropl(1)
    variable_attributes[0].name = 'Variable_List'
    variable_attributes[0].value = ",".join(pbsvars)
    pbs_attributes.extend(variable_attributes)
    self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

    # mail settings
    mail_attributes = pbs.new_attropl(1)
    mail_attributes[0].name = 'Mail_Points'
    mail_attributes[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attributes)
    self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

    import tempfile
    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    f.write(txt)
    f.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # extend parameter should be 'NULL' because this is required by the python api
    extend = 'NULL'
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, extend)

    is_error, errormsg = pbs.error()
    # pbs_submit can fail without setting an error; it then returns None as job id,
    # which must not be stored as if the submission had succeeded
    if is_error or jobid is None:
        self.log.error("Failed to submit job script %s (job id: %s, error %s)" % (scriptfn, jobid, errormsg))
    else:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid
        # only cleaned up on success: on failure the script path is reported in the error
        os.remove(scriptfn)
def main():
    """
    Query the state of every node of a PBS server and display the cluster status.

    The server name is taken from the first command line argument, falling back
    to the default PBS server. For each node the first listed state is translated
    via translate_state; a free node that reports a 'jobs' attribute is shown with
    the pbs_ND_free_and_job state instead. Nodes are labelled with the first run
    of digits in their name, or with a running counter when the name has none.
    Exits with status 1 when no server is known or the connection fails.
    """
    state_list = []
    node_list = []
    node_nr = 0

    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            print("No default pbs server, usage: pbsmon [server] ")
            sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print("%s %s" % (errno, text))
        sys.exit(1)

    # We are only interested in the state and jobs of a node
    attrl = pbs.new_attrl(2)
    attrl[0].name = 'state'
    attrl[1].name = 'jobs'

    # the extend argument is passed as the string "NULL", not None
    nodes = pbs.pbs_statnode(con, "", attrl, "NULL")

    # compiled once here rather than once per node
    re_host = re.compile(r"""
        (?P<name>\d+)
        """, re.VERBOSE)

    for node in nodes:
        node_attr = node.attribs

        # A node can have several states; we are only interested in the first entry.
        state = node_attr[0].value.split(',')[0]

        # a free node that already has a job scheduled on it is shown as a separate state
        has_jobs = any(attr.name == 'jobs' for attr in node_attr)
        if state == pbs.ND_free and has_jobs:
            state_list.append(translate_state[pbs_ND_free_and_job])
        else:
            state_list.append(translate_state[state])

        result = re_host.search(node.name)
        if result:
            node_list.append(result.group('name'))
        else:
            node_nr = node_nr + 1
            node_list.append(str(node_nr))

    display_cluster_status(node_list, state_list)
def check_watched_items( self ):
    """
    Called by the monitor thread to look at each watched job and deal
    with state changes.

    Job states are fetched in one batch via check_all_jobs(). Jobs that are
    still active are kept in self.watched; jobs that completed, left the queue
    or exceeded their limits are handed to self.work_queue together with
    finish_job or fail_job and dropped from the watch list.
    """
    new_watched = []
    # reduce pbs load by batching status queries
    ( failures, statuses ) = self.check_all_jobs()
    for pbs_job_state in self.watched:
        job_id = pbs_job_state.job_id
        galaxy_job_id = pbs_job_state.job_wrapper.get_id_tag()
        old_state = pbs_job_state.old_state
        pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params)
        if pbs_server_name in failures:
            log.debug( "(%s/%s) Skipping state check because PBS server connection failed" % ( galaxy_job_id, job_id ) )
            new_watched.append( pbs_job_state )
            continue
        try:
            status = statuses[job_id]
        except KeyError:
            # jobs deleted in the meantime are simply dropped from the watch list
            if pbs_job_state.job_wrapper.get_state() == model.Job.states.DELETED:
                continue

            try:
                # Recheck to make sure it wasn't a communication problem
                self.check_single_job( pbs_server_name, job_id )
                log.warning( "(%s/%s) PBS job was not in state check list, but was found with individual state check" % ( galaxy_job_id, job_id ) )
                new_watched.append( pbs_job_state )
            except Exception:
                # not a bare except: KeyboardInterrupt/SystemExit must not be
                # mistaken for a PBS status-check failure
                errno, text = pbs.error()
                if errno == 15001:
                    # 15001 == job not in queue
                    log.debug("(%s/%s) PBS job has left queue" % (galaxy_job_id, job_id) )
                    self.work_queue.put( ( self.finish_job, pbs_job_state ) )
                else:
                    # Unhandled error, continue to monitor
                    log.info("(%s/%s) PBS state check resulted in error (%d): %s" % (galaxy_job_id, job_id, errno, text) )
                    new_watched.append( pbs_job_state )
            continue
        if status.job_state != old_state:
            log.debug("(%s/%s) PBS job state changed from %s to %s" % ( galaxy_job_id, job_id, old_state, status.job_state ) )
        if status.job_state == "R" and not pbs_job_state.running:
            pbs_job_state.running = True
            pbs_job_state.job_wrapper.change_state( model.Job.states.RUNNING )
        if status.job_state == "R" and status.get( 'resources_used', False ):
            # resources_used may not be in the status for new jobs
            h, m, s = [ int( i ) for i in status.resources_used.walltime.split( ':' ) ]
            runtime = timedelta( 0, s, 0, 0, m, h )
            if pbs_job_state.check_limits( runtime=runtime ):
                self.work_queue.put( ( self.fail_job, pbs_job_state ) )
                continue
        elif status.job_state == "C":
            # "keep_completed" is enabled in PBS, so try to check exit status.
            # An explicit comparison is used instead of assert, which is stripped
            # under -O and would make every failed job look successful.
            try:
                exit_status = int( status.exit_status )
            except AttributeError:
                # No exit_status, can't verify proper completion so we just have to assume success.
                log.debug("(%s/%s) PBS job has completed" % ( galaxy_job_id, job_id ) )
            else:
                if exit_status != 0:
                    error_message = JOB_EXIT_STATUS.get( exit_status, 'Unknown error: %s' % status.exit_status )
                    pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message
                    log.error( '(%s/%s) PBS job failed: %s' % ( galaxy_job_id, job_id, error_message ) )
                    pbs_job_state.stop_job = False
                    self.work_queue.put( ( self.fail_job, pbs_job_state ) )
                    continue
                log.debug("(%s/%s) PBS job has completed successfully" % ( galaxy_job_id, job_id ) )
            self.work_queue.put( ( self.finish_job, pbs_job_state ) )
            continue
        pbs_job_state.old_state = status.job_state
        new_watched.append( pbs_job_state )
    # Replace the watch list with the updated version
    self.watched = new_watched
def submit(self, txt):
    """
    Submit the jobscript txt, set self.jobid.

    Job attributes are built from self.options ('name', 'queue') and self.args
    ('resources', 'mail', 'queue', 'account'); unknown args are logged as errors
    and ignored. The script is written to a temporary file that is removed again
    after a successful submission.

    :param txt: contents of the job script
    """
    self.log.debug("Going to submit script %s", txt)

    attropl = pbs.new_attropl(2)  # jobparams
    attropl[0].name = 'Job_Name'
    attropl[0].value = self.options.get('name', 'python_pbs_job')
    attropl[1].name = 'Rerunable'
    attropl[1].value = 'y'

    for arg in self.args.keys():
        tmp = self.args[arg]
        # the attribute list is sized per argument type: sizing it with len(tmp)
        # up front appended one blank attribute per character of string values
        if arg in ('resources',):
            tmpattropl = pbs.new_attropl(len(tmp))
            for idx, (k, v) in enumerate(tmp.items()):
                tmpattropl[idx].name = 'Resource_List'  # resources
                tmpattropl[idx].resource = k
                tmpattropl[idx].value = v
        elif arg in ('mail',):
            with_users = len(tmp) > 1 and 'others' in tmp
            tmpattropl = pbs.new_attropl(2 if with_users else 1)
            tmpattropl[0].name = 'Mail_Points'
            tmpattropl[0].value = tmp['send']
            if with_users:
                # Mail_Users gets its own slot instead of overwriting Mail_Points
                tmpattropl[1].name = "Mail_Users"
                tmpattropl[1].value = tmp['others']
        elif arg in ('queue',):
            # the queue is passed via the destination field of pbs_submit,
            # so there is no attribute to add
            continue
        elif arg in ('account',):
            tmpattropl = pbs.new_attropl(1)
            tmpattropl[0].name = pbs.ATTR_A
            tmpattropl[0].value = tmp
        else:
            self.log.error('Unknown arg %s', arg)
            continue

        attropl.extend(tmpattropl)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())

    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']

    tmpattropl = pbs.new_attropl(1)
    tmpattropl[0].name = 'Variable_List'
    tmpattropl[0].value = ",".join(
        ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars])
    attropl.extend(tmpattropl)

    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temp jobscript to %s" % scriptfn)
    f.write(txt)
    f.close()

    queue = self.args.get('queue', self.options.get('queue', ''))  # do not set with attropl
    if queue:
        self.log.debug("Going to submit to queue %s", queue)
    else:
        self.log.debug("No queue specified. Will submit to default destination.")

    extend = 'NULL'  # always

    jobid = pbs.pbs_submit(self.pbsconn, attropl, scriptfn, queue, extend)

    is_error, errormsg = pbs.error()
    if is_error:
        self.log.error("Failed to submit job script %s: error %s", scriptfn, errormsg)
    else:
        self.log.debug("Succesful jobsubmission returned jobid %s", jobid)
        self.jobid = jobid
        # only cleaned up on success: on failure the script path is reported in the error
        os.remove(scriptfn)
def submit_pbs(self, name, taskfile, lastid=None):
    """
    Submit the target called `name` to PBS and return the resulting job id.

    Missing job attributes (name, variable list, shell) are filled in with
    defaults, dependencies on the target's components and on `lastid` are
    attached, and the job id is stored in the target under "torqueid".

    :param name: key of the target in self.targets; text after "::" selects the
                 dependency type, which defaults to "afterok"
    :param taskfile: object whose .name is the path of the script to submit
    :param lastid: optional job id this job should additionally depend on
    :return: the job id returned by pbs_submit
    :raises Exception: carrying pbs.error() when submission fails
    """
    all_targets = self.targets
    target = all_targets[name]
    attrs = target["attrs"]
    env = target["env"]

    attrs.setdefault(pbs.ATTR_N, name)

    # Just include all variables by default
    exported = ["%s=%s" % (k, v) for k, v in env.asdict().iteritems()]
    attrs.setdefault(pbs.ATTR_v, ",".join(exported))

    # Track job dependencies
    dep_type = name.partition("::")[-1] or "afterok"
    dependencies = ["%s:%s" % (dep_type, all_targets[dep]["torqueid"])
                    for dep in target["components"]]
    if lastid:
        dependencies.append("%s:%s" % (dep_type, lastid))
    if dependencies:
        attrs[pbs.ATTR_depend] = ",".join(dependencies)

    # /bin/sh as a default shell will generally do the right thing.
    # It honors #! syntax at the beginning of the file and it
    # interprets basic commands without a #! at the beginning of
    # the file. Obscure users can opt for other shells
    # (eg: bash,csh,ksh,python,...) via the standard #! syntax
    # -- This default ensures users with non-standard shells
    # can still use pbsmake files from other users.
    attrs.setdefault(pbs.ATTR_S, "/bin/sh")

    # We need to handle ATTR_l specially. Each resource needs its own
    # attropl with the name pbs.ATTR_l:
    resource_specs = []
    if pbs.ATTR_l in attrs:
        resource_specs = attrs[pbs.ATTR_l].split(",")
        del attrs[pbs.ATTR_l]

    # Attach attributes to job as the pbs module expects it:
    # plain attributes first, then one entry per resource
    attropl = pbs.new_attropl(len(attrs) + len(resource_specs))
    slot = 0
    for attr_name in attrs:
        attropl[slot].name = attr_name
        attropl[slot].value = env.interp(attrs[attr_name], defer=False)
        slot += 1
    for spec in resource_specs:
        res_name, res_value = spec.split("=", 1)
        attropl[slot].name = pbs.ATTR_l
        attropl[slot].resource = res_name
        attropl[slot].value = env.interp(res_value, defer=False)
        slot += 1

    try:
        destination = attrs["queue"]
    except KeyError:
        destination = ""

    # attempt to submit job
    lastid = pbs.pbs_submit(self.conn, attropl, taskfile.name, destination, "")
    if not lastid:
        print("Error submitting job: %s\n\tAttributes:" % name)
        for attr, val in attrs.items():
            print("\t\t%s: %s" % (attr, val))
        raise Exception(pbs.error())

    target["torqueid"] = lastid
    return lastid
def check_watched_items(self):
    """
    Called by the monitor thread to look at each watched job and deal
    with state changes.

    Job states are fetched in one batch via check_all_jobs(). Jobs that are
    still active are kept in self.watched; jobs that completed, left the queue
    or exceeded their limits are handed to self.work_queue together with
    finish_job or fail_job and dropped from the watch list.
    """
    new_watched = []
    # reduce pbs load by batching status queries
    (failures, statuses) = self.check_all_jobs()
    for pbs_job_state in self.watched:
        job_id = pbs_job_state.job_id
        galaxy_job_id = pbs_job_state.job_wrapper.get_id_tag()
        old_state = pbs_job_state.old_state
        pbs_server_name = self.__get_pbs_server(
            pbs_job_state.job_destination.params)
        if pbs_server_name in failures:
            log.debug(
                "(%s/%s) Skipping state check because PBS server connection failed"
                % (galaxy_job_id, job_id))
            new_watched.append(pbs_job_state)
            continue
        try:
            status = statuses[job_id]
        except KeyError:
            # jobs deleted in the meantime are simply dropped from the watch list
            if pbs_job_state.job_wrapper.get_state() == model.Job.states.DELETED:
                continue

            try:
                # Recheck to make sure it wasn't a communication problem
                self.check_single_job(pbs_server_name, job_id)
                log.warning(
                    "(%s/%s) PBS job was not in state check list, but was found with individual state check"
                    % (galaxy_job_id, job_id))
                new_watched.append(pbs_job_state)
            except Exception:
                # not a bare except: KeyboardInterrupt/SystemExit must not be
                # mistaken for a PBS status-check failure
                errno, text = pbs.error()
                if errno == 15001:
                    # 15001 == job not in queue
                    log.debug("(%s/%s) PBS job has left queue" % (galaxy_job_id, job_id))
                    self.work_queue.put((self.finish_job, pbs_job_state))
                else:
                    # Unhandled error, continue to monitor
                    log.info(
                        "(%s/%s) PBS state check resulted in error (%d): %s"
                        % (galaxy_job_id, job_id, errno, text))
                    new_watched.append(pbs_job_state)
            continue
        if status.job_state != old_state:
            log.debug("(%s/%s) PBS job state changed from %s to %s" %
                      (galaxy_job_id, job_id, old_state, status.job_state))
        if status.job_state == "R" and not pbs_job_state.running:
            pbs_job_state.running = True
            pbs_job_state.job_wrapper.change_state(model.Job.states.RUNNING)
        if status.job_state == "R" and status.get('resources_used', False):
            # resources_used may not be in the status for new jobs
            h, m, s = [
                int(i) for i in status.resources_used.walltime.split(':')
            ]
            runtime = timedelta(0, s, 0, 0, m, h)
            if pbs_job_state.check_limits(runtime=runtime):
                self.work_queue.put((self.fail_job, pbs_job_state))
                continue
        elif status.job_state == "C":
            # "keep_completed" is enabled in PBS, so try to check exit status.
            # An explicit comparison is used instead of assert, which is stripped
            # under -O and would make every failed job look successful.
            try:
                exit_status = int(status.exit_status)
            except AttributeError:
                # No exit_status, can't verify proper completion so we just have to assume success.
                log.debug("(%s/%s) PBS job has completed" % (galaxy_job_id, job_id))
            else:
                if exit_status != 0:
                    error_message = JOB_EXIT_STATUS.get(
                        exit_status, 'Unknown error: %s' % status.exit_status)
                    pbs_job_state.fail_message = CLUSTER_ERROR_MESSAGE % error_message
                    log.error('(%s/%s) PBS job failed: %s' %
                              (galaxy_job_id, job_id, error_message))
                    pbs_job_state.stop_job = False
                    self.work_queue.put((self.fail_job, pbs_job_state))
                    continue
                log.debug("(%s/%s) PBS job has completed successfully" %
                          (galaxy_job_id, job_id))
            self.work_queue.put((self.finish_job, pbs_job_state))
            continue
        pbs_job_state.old_state = status.job_state
        new_watched.append(pbs_job_state)
    # Replace the watch list with the updated version
    self.watched = new_watched
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18", single=False, background="",
                      tools=None, job_server="", ncpus=8, logger=None, max_time=None, fg_file=None,
                      bg_file=None):
    """
    Run the selected motif prediction tools as PBS jobs and collect their results.

    One job is submitted per tool, or one per motif width for tools that use a
    width. Job scripts and their stdout/stderr files are written to a "torque"
    directory next to `fastafile`. The function then polls the queue every five
    seconds until all jobs are gone, `max_time` seconds have passed, or the user
    interrupts it, adding the output of each finished job to a PredictionResult.

    :param fastafile: input FASTA file; also determines the run directory
    :param outfile: passed to PredictionResult
    :param analysis: one of "xs", "small", "medium", "large", "xl"; selects the
                     range of motif widths ("xs" is switched to "small" after
                     the maximum width has been looked up)
    :param organism, single, background: passed on to the job scripts
    :param tools: mapping of tool name to a true/false flag; defaults to the configured tools
    :param job_server, ncpus: accepted but not used by this function
    :param logger: logger to report progress to; a module-style default is used when None
    :param max_time: maximum number of seconds to wait for the jobs, or None for no limit
    :param fg_file, bg_file: passed to PredictionResult
    """
    if logger is None:
        # without a logger the very first progress message would raise AttributeError
        import logging
        log = logging.getLogger('prediction.pp_predict_motifs')
    else:
        log = logger

    if tools is None:
        tools = {}

    config = MotifConfig()

    if not tools:
        # NOTE(review): get_default_params is subscripted, not called -- this only
        # works if it is a property/attribute; confirm against MotifConfig.
        tools = dict((x, 1) for x in config.get_default_params["tools"].split(","))

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    result = PredictionResult(outfile, logger=logger, fg_file=fg_file, bg_file=bg_file)

    # Dynamically load all tools
    toolio = [x[1]() for x in inspect.getmembers(
        tool_classes,
        lambda x: inspect.isclass(x) and issubclass(x, tool_classes.MotifProgram)
    ) if x[0] != 'MotifProgram']

    # TODO:
    # Add warnings for running time: Weeder GADEM

    # Prepare PBS submission; the name and output paths are filled in per job
    server = pbs.pbs_default()
    c = pbs.pbs_connect(server)
    q = PBSQuery()
    attropl = pbs.new_attropl(6)
    # Name
    attropl[0].name = pbs.ATTR_N
    # Restartable
    attropl[1].name = pbs.ATTR_r
    attropl[1].value = 'y'
    # Walltime
    attropl[2].name = pbs.ATTR_l
    attropl[2].resource = 'walltime'
    attropl[2].value = '600'
    # Node requirements
    attropl[3].name = pbs.ATTR_l
    attropl[3].resource = 'nodes'
    attropl[3].value = '1:ppn=1'
    # Output and error files
    attropl[4].name = pbs.ATTR_o
    attropl[5].name = pbs.ATTR_e

    rundir = os.path.join(os.path.split(os.path.abspath(fastafile))[0], "torque")
    if not os.path.exists(rundir):
        os.mkdir(rundir)

    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism
    }

    # job id -> job name of every successfully submitted job
    jobs = {}
    for t in toolio:
        if t.name in tools and tools[t.name]:
            if t.use_width:
                for i in range(wmin, wmax + 1, step):
                    log.info("Starting %s job, width %s" % (t.name, i))
                    params['width'] = i
                    sh = write_shell_script(t.name, fastafile, rundir=rundir, params=params)
                    job_name = os.path.basename(os.path.splitext(sh)[0])
                    # submit
                    attropl[0].value = job_name
                    attropl[4].value = "{0}/{1}.stdout".format(rundir, job_name)
                    attropl[5].value = "{0}/{1}.stderr".format(rundir, job_name)
                    job_id = pbs.pbs_submit(c, attropl, sh, "batchq", 'NULL')
                    e, e_txt = pbs.error()
                    if e:
                        log.error("Failed: {0}".format(e_txt))
                    else:
                        jobs[job_id] = job_name
            else:
                log.debug("Starting %s job" % t.name)
                sh = write_shell_script(t.name, fastafile, rundir=rundir, params=params)
                job_name = os.path.basename(os.path.splitext(sh)[0])
                # submit
                attropl[0].value = job_name
                attropl[4].value = "{0}/{1}.stdout".format(rundir, job_name)
                attropl[5].value = "{0}/{1}.stderr".format(rundir, job_name)
                job_id = pbs.pbs_submit(c, attropl, sh, "batchq", 'NULL')
                e, e_txt = pbs.error()
                if e:
                    log.error("Failed submission: {0}".format(e_txt))
                else:
                    jobs[job_id] = job_name
        else:
            log.debug("Skipping %s" % t.name)

    ### Wait until all jobs are finished or the time runs out ###
    start_time = time()
    try:
        # Run while jobs remain AND the time limit (if any) has not been reached.
        # The parentheses matter: without them the loop kept sleeping until
        # max_time had passed even after every job had finished.
        while len(jobs) > 0 and (not max_time or time() - start_time < max_time):
            # iterate over a copy: finished jobs are removed inside the loop
            for job_id, job_name in list(jobs.items()):
                job = q.getjob(job_id)

                # the job counts as finished once the queue no longer reports it
                if not job:
                    motifs = []
                    pwmfile = os.path.join(rundir, "{0}.pwm".format(job_name))
                    if os.path.exists(pwmfile):
                        with open(pwmfile) as pwm_handle:
                            motifs = read_motifs(pwm_handle, fmt="pwm")
                    else:
                        log.error("Job {0} finished, but couldn find {1}!".format(job_name, pwmfile))
                    with open(os.path.join(rundir, "{0}.stdout".format(job_name))) as out_handle:
                        stdout = out_handle.read()
                    with open(os.path.join(rundir, "{0}.stderr".format(job_name))) as err_handle:
                        stderr = err_handle.read()

                    result.add_motifs(job_id, (motifs, stdout, stderr))

                    del jobs[job_id]
            sleep(5)

    ### Or the user gets impatient... ###
    except KeyboardInterrupt:
        # NOTE(review): the message announces that running jobs are destroyed,
        # but no job is actually deleted here -- confirm intended behavior.
        log.info("Caught interrupt, destroying all running jobs")
def pbsmon():
    """Print a rack-by-rack overview of the PBS node states plus a legend.

    The server name is taken from the first command line argument, falling
    back to the default PBS server. Nodes named ``gb-r<rack>n<node>`` are
    shown in a grid with one column per rack and one row per node number;
    the state legend with per-state counts is printed to the right of the
    first rows. Nodes that do not fit the naming scheme are listed below
    the grid.

    Relies on the module-level names NODES_PER_RACK, N_RACKS, PBS_STATES and
    pbs_ND_single (read only, so no ``global`` declaration is needed).

    Exits with status 1 when no server is known or the connection fails.
    Every output line is built completely and printed with a single
    argument, so the same code runs on Python 2 and Python 3.
    """
    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            print("No default pbs server, usage: %s [server]" % os.path.basename(sys.argv[0]))
            sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print("%s %s" % (errno, text))
        sys.exit(1)

    node_dict = {}
    count_states = dict.fromkeys(PBS_STATES, 0)

    try:
        # get the state of the nodes
        attrl = pbs.new_attrl(2)
        attrl[0].name = "state"
        attrl[1].name = "jobs"
        nodes = pbs.pbs_statnode(con, "", attrl, "NULL")

        for node in nodes:
            node_attr = node.attribs
            # only the first of the comma separated states is used
            state = node_attr[0].value.split(",")[0]
            state_char = PBS_STATES[state]
            count_states[state] = count_states[state] + 1

            # a free node that reports a second attribute is counted
            # under pbs_ND_single instead of free
            if state == pbs.ND_free and len(node_attr) > 1:
                state_char = PBS_STATES[pbs_ND_single]
                count_states[pbs.ND_free] = count_states[pbs.ND_free] - 1
                count_states[pbs_ND_single] = count_states[pbs_ND_single] + 1

            node_dict[node.name] = state_char
    finally:
        # the connection used to be left open
        pbs.pbs_disconnect(con)

    legend = sorted(PBS_STATES.keys())
    racks = range(1, N_RACKS + 1)

    # print nodes with gb-r%dn%d naming scheme
    # (fields are joined by one space, like the comma-terminated print
    # statements this replaces)
    print(" ".join([" "] + ["%2d" % rack for rack in racks]))

    for node_nr in range(1, NODES_PER_RACK + 1):
        fields = ["%2d" % node_nr]

        for rack in racks:
            node_name = "gb-r%dn%d" % (rack, node_nr)
            if node_name in node_dict:
                fields.append(" %s" % node_dict.pop(node_name))
            else:
                fields.append(" ")

        if node_nr - 1 < len(legend):
            state = legend[node_nr - 1]
            fields.append(" %s %-13s : %d" % (PBS_STATES[state], state, count_states[state]))

        print(" ".join(fields))

    print("")

    # any other nodes?
    arr = sorted(node_dict.keys())
    if arr:
        for node in arr:
            print("%s %s" % (node, node_dict[node]))
        print("")
def submit(self):
    """Submit ``self.job_script`` to the default PBS server.

    The environment produced by ``self.generate_env()`` is always sent; the
    name, walltime, nodes, stdout path, stderr path, dependency list, mail
    options, mem and vmem settings are only sent when they are truthy.

    Returns:
        The job id returned by ``pbs.pbs_submit`` (also stored in
        ``self.job_id``).

    Raises:
        Exception: with the message "<errno>: <text>" when the batch system
            reports an error.
    """
    attropl = pbs.new_attropl(self.attribute_count + 1)

    # (attribute name, resource or None, value) in submission order
    settings = [
        (pbs.ATTR_v, None, self.generate_env()),
    ]
    optional = [
        (pbs.ATTR_N, None, self.name),
        (pbs.ATTR_l, 'walltime', self.walltime),
        (pbs.ATTR_l, 'nodes', self.nodes),
        (pbs.ATTR_o, None, self.stdout_path),
        # stderr was registered as ATTR_o before, duplicating the stdout
        # attribute instead of setting the error path
        (pbs.ATTR_e, None, self.stderr_path),
        (pbs.ATTR_depend, None, self.dependency_list),
        (pbs.ATTR_m, None, self.mail_options),
        (pbs.ATTR_l, 'mem', self.mem),
        (pbs.ATTR_l, 'vmem', self.vmem),
    ]
    settings.extend(entry for entry in optional if entry[2])

    for attropl_idx, (name, resource, value) in enumerate(settings):
        attropl[attropl_idx].name = name
        if resource is not None:
            attropl[attropl_idx].resource = resource
        attropl[attropl_idx].value = value

    connection = pbs.pbs_connect(pbs.pbs_default())
    try:
        self.job_id = pbs.pbs_submit(connection, attropl, self.job_script, None, None)
        # Read the error state before disconnecting, so it describes the
        # submission. NOTE(review): presumably pbs_disconnect can change the
        # library error state -- confirm.
        e, e_msg = pbs.error()
    finally:
        # always release the connection, also when the submission raises
        pbs.pbs_disconnect(connection)

    # the batch system returned an error, throw exception
    if e:
        message = "%d: %s" % (e, e_msg)
        raise Exception(message)

    return self.job_id
attropl[index].name = getattr(pbs, 'ATTR_' + attr) attropl[index].resource = resource attropl[index].value = value index += 1 else: attropl[index].name = getattr(pbs, 'ATTR_' + attr) attropl[index].value = value index += 1 return attropl if __name__ == '__main__': jp = JobParser() try: jp.read(sys.argv[1]) except IndexError: print("Usage: %s <jobscript>" % (sys.argv[0])) sys.exit(1) server_name = pbs.pbs_default() con = pbs.pbs_connect(server_name) job_id = pbs.pbs_submit(con, jp.get_attropl(), sys.argv[1], 'batch', 'NULL') e, e_txt = pbs.error() if e: print(e, e_txt) else: print(job_id)
def pbsmon():
    """Print a rack-by-rack overview of the PBS node states plus a legend.

    The server name is taken from the first command line argument, falling
    back to the default PBS server. Nodes named ``gb-r<rack>n<node>`` are
    shown in a grid with one column per rack and one row per node number;
    the state legend with per-state counts is printed to the right of the
    first rows. Nodes that do not fit the naming scheme are listed below
    the grid.

    Relies on the module-level names NODES_PER_RACK, N_RACKS, PBS_STATES and
    pbs_ND_single (read only, so no ``global`` declaration is needed).

    Exits with status 1 when no server is known or the connection fails.
    Every output line is built completely and printed with a single
    argument, so the same code runs on Python 2 and Python 3.
    """
    if len(sys.argv) > 1:
        pbs_server = sys.argv[1]
    else:
        pbs_server = pbs.pbs_default()
        if not pbs_server:
            print('No default pbs server, usage: %s [server]' % os.path.basename(
                sys.argv[0]))
            sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con < 0:
        errno, text = pbs.error()
        print('%s %s' % (errno, text))
        sys.exit(1)

    node_dict = {}
    count_states = dict.fromkeys(PBS_STATES, 0)

    try:
        # get the state of the nodes
        attrl = pbs.new_attrl(2)
        attrl[0].name = 'state'
        attrl[1].name = 'jobs'
        nodes = pbs.pbs_statnode(con, '', attrl, 'NULL')

        for node in nodes:
            node_attr = node.attribs
            # only the first of the comma separated states is used
            state = node_attr[0].value.split(',')[0]
            state_char = PBS_STATES[state]
            count_states[state] = count_states[state] + 1

            # a free node that reports a second attribute is counted
            # under pbs_ND_single instead of free
            if state == pbs.ND_free and len(node_attr) > 1:
                state_char = PBS_STATES[pbs_ND_single]
                count_states[pbs.ND_free] = count_states[pbs.ND_free] - 1
                count_states[pbs_ND_single] = count_states[pbs_ND_single] + 1

            node_dict[node.name] = state_char
    finally:
        # the connection used to be left open
        pbs.pbs_disconnect(con)

    legend = sorted(PBS_STATES.keys())
    racks = range(1, N_RACKS + 1)

    # print nodes with gb-r%dn%d naming scheme
    # (fields are joined by one space, like the comma-terminated print
    # statements this replaces)
    print(' '.join([' '] + ['%2d' % rack for rack in racks]))

    for node_nr in range(1, NODES_PER_RACK + 1):
        fields = ['%2d' % node_nr]

        for rack in racks:
            node_name = 'gb-r%dn%d' % (rack, node_nr)
            if node_name in node_dict:
                fields.append(' %s' % node_dict.pop(node_name))
            else:
                fields.append(' ')

        if node_nr - 1 < len(legend):
            state = legend[node_nr - 1]
            fields.append(' %s %-13s : %d' % (PBS_STATES[state], state,
                                              count_states[state]))

        print(' '.join(fields))

    print('')

    # any other nodes?
    arr = sorted(node_dict.keys())
    if arr:
        for node in arr:
            print('%s %s' % (node, node_dict[node]))
        print('')
def submitJob(self, conn, job, task=None, requirements=''):
    """Submit one job to PBS through a generated wrapper script.

    The wrapper script changes to the worker node work dir (when set),
    creates and enters $PBS_JOBCOOKIE, copies the files of the task's
    global sandbox there, runs the job executable with stdout and stderr
    redirected, and removes $PBS_JOBCOOKIE afterwards. A second script is
    registered as the 'epilogue' resource; it removes $PBS_JOBCOOKIE again
    and touches $HOME/done.$1.

    NB: we assume an env var PBS_JOBCOOKIE points to the exec dir on the
    batch host.

    Parameters:
        conn: PBS connection handle passed to ``pbs.pbs_submit``.
        job: mapping with the keys 'executable', 'arguments',
            'standardOutput', 'standardError' and 'name'.
        task: mapping with the key 'globalSandbox', a comma separated list
            of files to copy to the worker node. Despite the None default
            it is required.
        requirements: not used by this method.

    Returns:
        The tuple ``({job['name']: jobid}, None, None)``.

    Raises:
        SchedulerError: when the submission does not return a job id.
    """
    # Skip empty entries, they would produce a broken 'cp  .' line.
    sandbox_files = [name for name in task['globalSandbox'].split(',') if name]

    # Write a temporary submit script
    script = ['#!/bin/sh']
    if self.workerNodeWorkDir:
        script.append('cd ' + self.workerNodeWorkDir)
    script.append(
        'if [ ! -d $PBS_JOBCOOKIE ] ; then mkdir -p $PBS_JOBCOOKIE ; fi')
    script.append('cd $PBS_JOBCOOKIE')
    script.extend('cp ' + ifile + ' .' for ifile in sandbox_files)
    script.append(self.jobScriptDir + job['executable'] + ' ' + job['arguments'] +
                  ' >' + job['standardOutput'] + ' 2>' + job['standardError'])
    if self.workerNodeWorkDir:
        script.append('cd ' + self.workerNodeWorkDir)
    # this fails if the job is aborted, which leaks disc space. Adding an epilogue to make
    # sure it's gone for good - AMM 18/07/2011
    script.append('rm -fr $PBS_JOBCOOKIE')

    f = tempfile.NamedTemporaryFile()
    f.write('\n'.join(script))
    f.flush()

    cleanup = ['#!/bin/sh']
    if self.workerNodeWorkDir:
        cleanup.append('cd ' + self.workerNodeWorkDir)
    cleanup.append('rm -fr $PBS_JOBCOOKIE')
    cleanup.append('touch $HOME/done.$1')

    # NOTE(review): this temporary file is removed once the object is
    # garbage collected; confirm the epilogue does not have to outlive
    # this call.
    epilogue = tempfile.NamedTemporaryFile()
    epilogue.write('\n'.join(cleanup))
    epilogue.flush()
    # The mode has to be octal: the decimal literal 700 is octal 1274.
    os.chmod(epilogue.name, 0o700)

    attr_dict = {
        'Job_Name': 'CRAB_PBS',
        'Variable_List': self.pbs_env,
        'Output_Path': self.jobResDir + 'wrapper_' + str(job['standardOutput']),
        'Error_Path': self.jobResDir + 'wrapper_' + str(job['standardError'])
    }

    # plain attributes + resources + one slot for the epilogue
    attropl = pbs.new_attropl(len(attr_dict) + len(self.res_dict) + 1)
    i_attr = 0

    for k, value in attr_dict.items():
        self.logging.debug("adding k %s" % k)
        attropl[i_attr].name = k
        attropl[i_attr].value = value
        i_attr += 1

    for k, value in self.res_dict.items():
        attropl[i_attr].name = 'Resource_List'
        attropl[i_attr].resource = k
        attropl[i_attr].value = value
        i_attr += 1

    attropl[i_attr].name = 'Resource_List'
    attropl[i_attr].resource = 'epilogue'
    attropl[i_attr].value = epilogue.name
    self.logging.debug("adding epilogue: %s" % epilogue.name)
    i_attr += 1

    try:
        jobid = pbs.pbs_submit(conn, attropl, f.name, self.queue, 'NULL')
    finally:
        # remove the temporary job script, also when the submission raises
        f.close()

    if not jobid:
        err, err_text = pbs.error()
        self.logging.error('Error in job submission')
        self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
        self.pbs_disconn(conn)
        raise SchedulerError('PBS error', str(err) + ': ' + err_text)

    return {job['name']: jobid}, None, None