def _process(self, batch_list):
    '''Execute the changes on the batch server'''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process input:%s' % str(batch_list), file=sys.stderr)

    ## Always get the pbs_server name, even in dry-run mode
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        _print('Could not locate a pbs server', file=sys.stderr)
        sys.exit(1)

    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_process pbs_server:%s' % pbs_server, file=sys.stderr)

    ## If dry-run is not specified create a connection
    if not ARGS_DRYRUN:
        pbs_connection = pbs.pbs_connect(pbs_server)

    ## Execute the changes, reusing the single connection opened above
    for node in batch_list:
        if not ARGS_DRYRUN:
            rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node[0], node[1], 'NULL')
            if rcode > 0:
                errno, text = pbs.error()
                _print('PBS error for node \'%s\': %s (%s)' % (node[0], text, errno), file=sys.stderr)
        else:
            _print("pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, %s, %s, 'NULL')" % (node[0], str(node[1])))

    ## Close the connection with the batch system
    if not ARGS_DRYRUN:
        pbs.pbs_disconnect(pbs_connection)
def _connect_to_server(server=None):
    """
    Open a connection to a pbs_server at hostname server; if server is
    None, connect to the default server.

    This function is shared between JobManager and TorqueJobRunner.
    """
    server_name = server if server else pbs.pbs_default()

    retry = 0
    connection = pbs.pbs_connect(server_name)
    while connection <= 0 and retry < _MAX_RETRY:
        retry += 1
        time.sleep(retry ** 2)
        connection = pbs.pbs_connect(server_name)

    if connection <= 0:
        e, e_msg = pbs.error()
        # the batch system returned an error, throw exception
        raise Exception("Error connecting to pbs_server. "
                        "Torque error {0}: '{1}'".format(e, torque_strerror(e)))

    return connection
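# A hedged usage sketch for _connect_to_server() above: connect with the
# quadratic backoff, list job ids, and always disconnect. _MAX_RETRY and
# torque_strerror are assumed to be defined alongside the helper; the
# pbs_statjob call mirrors the other snippets in this collection.
def list_job_ids(server=None):
    connection = _connect_to_server(server)
    try:
        jobs = pbs.pbs_statjob(connection, "", [], "")
        return [j.name for j in jobs]
    finally:
        pbs.pbs_disconnect(connection)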
def update_all_jobs(batchserver_name):
    """ Update info about all jobs of the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if batchserver_name not in pbs_data_jobs:
        pbs_data_jobs[batchserver_name] = {'last_update': None, 'jobs': {}}

    if pbs_data_jobs[batchserver_name]['last_update'] and \
       (datetime.datetime.now() - pbs_data_jobs[batchserver_name]['last_update']).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("jobs info is new enough for server: %s" % batchserver_name)
        return pbs_data_jobs

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statjobs = pbs.pbs_statjob(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sj in statjobs:
        jobid = sj.name
        # flatten the attribute list into a dict, qualifying resource
        # attributes (e.g. Resource_List) with their resource name
        attr_dict = {}
        for x in sj.attribs:
            if x.resource:
                attr_dict[x.name + "_" + x.resource] = x.value
            else:
                attr_dict[x.name] = x.value
        pbs_data_jobs[batchserver_name]['jobs'][jobid] = update_one_job_from_pbs_data(jobid, attr_dict)

    pbs_data_jobs[batchserver_name]['last_update'] = datetime.datetime.now()
    return pbs_data_jobs
def update_all_queues(batchserver_name):
    """ Update info about all queues for the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if server.queues_lastupdate and \
       (datetime.datetime.now() - server.queues_lastupdate).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("Queue info is new enough for server: %s" % batchserver_name)
        return

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statqueues = pbs.pbs_statque(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sq in statqueues:
        queue, created = getQueue(sq.name, server)
        attr_dict = dict([(x.name, x.value) for x in sq.attribs])
        update_one_queue_from_pbs_data(queue, attr_dict)
        queue.save()
    server.queues_lastupdate = datetime.datetime.now()
    server.save()
def check_all_jobs(self):
    """
    Returns a list of servers that failed to be contacted and a dict
    of "job_id : status" pairs (where status is a bunchified version
    of the API's structure).
    """
    servers = []
    failures = []
    statuses = {}
    for pbs_job_state in self.watched:
        pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params)
        if pbs_server_name not in servers:
            servers.append(pbs_server_name)
        pbs_job_state.check_count += 1
    for pbs_server_name in servers:
        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            log.debug("connection to PBS server %s for state check failed" % pbs_server_name)
            failures.append(pbs_server_name)
            continue
        stat_attrl = pbs.new_attrl(3)
        stat_attrl[0].name = pbs.ATTR_state
        stat_attrl[1].name = pbs.ATTR_used
        stat_attrl[2].name = pbs.ATTR_exitstat
        jobs = pbs.pbs_statjob(c, None, stat_attrl, None)
        pbs.pbs_disconnect(c)
        statuses.update(self.convert_statjob_to_bunches(jobs))
    return failures, statuses
def stop_job(self, job):
    """Attempts to delete a job from the PBS queue"""
    job_id = job.get_job_runner_external_id().encode('utf-8')
    job_tag = "(%s/%s)" % (job.get_id_tag(), job_id)
    log.debug("%s Stopping PBS job" % job_tag)

    # Declare the connection handle c so that it can be cleaned up:
    c = None
    try:
        pbs_server_name = self.__get_pbs_server(job.destination_params)
        if pbs_server_name is None:
            log.debug("(%s) Job queued but no destination stored in job params, cannot delete" % job_tag)
            return
        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            log.debug("(%s) Connection to PBS server for job delete failed" % job_tag)
            return
        pbs.pbs_deljob(c, job_id, '')
        log.debug("%s Removed from PBS queue before job completion" % job_tag)
    except:
        e = traceback.format_exc()
        log.debug("%s Unable to stop job: %s" % (job_tag, e))
    finally:
        # Cleanup: disconnect from the server.
        if c is not None:
            pbs.pbs_disconnect(c)
def __init__(self, script, name, env_vars=None, resources={}, conn=None, ppn=None):
    """
    Create a new Job to be submitted to PBS.

    env_vars is a dictionary with key-value pairs of environment
    variables that should be passed on to the job.

    resources is a dictionary with optional keys ['hours', 'cores'];
    both should be integer values. hours can be 1 - MAX_WALLTIME;
    cores depends on which cluster it is being run.
    """
    self.clean_conn = True
    self.log = fancylogger.getLogger(self.__class__.__name__, fname=False)
    self.script = script
    if env_vars:
        self.env_vars = env_vars.copy()
    else:
        self.env_vars = {}
    self.name = name

    if pbs_import_failed:
        self.log.error(pbs_import_failed)

    try:
        self.pbs_server = pbs.pbs_default()
        if conn:
            self.pbsconn = conn
            self.clean_conn = False
        else:
            self.pbsconn = pbs.pbs_connect(self.pbs_server)
    except Exception, err:
        self.log.error("Failed to connect to the default pbs server: %s" % err)
def main():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print 'No default server'
        sys.exit(1)

    if len(sys.argv) < 2:
        print "Usage: set_property.py <hostname>"
        sys.exit(1)

    hostname = sys.argv[1]

    con = pbs.pbs_connect(pbs_server)

    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'note'
    attrop_l[0].value = 'set_something_useful'
    attrop_l[0].op = pbs.SET

    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, hostname, attrop_l, 'NULL')
    if r > 0:
        print r, ";"
        errno, text = pbs.error()
        print errno, text
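# A hedged companion sketch to the setter above: clearing the 'note'
# attribute again, assuming pbs.UNSET is available as a batch operation
# alongside pbs.SET/pbs.INCR used in these snippets. The hostname is a
# placeholder.
def clear_note(hostname):
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print 'No default server'
        sys.exit(1)
    con = pbs.pbs_connect(pbs_server)
    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'note'
    attrop_l[0].op = pbs.UNSET
    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, hostname, attrop_l, 'NULL')
    if r > 0:
        errno, text = pbs.error()
        print errno, text
    pbs.pbs_disconnect(con)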
def alive(self, process_id):
    alive = False
    try:
        status = self.pbsquery.getjob(str(process_id))['job_state'][0]
    except:
        # job not found
        status = -1
        sys.stderr.write("EXC: %s\n" % str(sys.exc_info()[0]))
        sys.stderr.write("Could not find job for process id %d\n" % process_id)

    if status == 'Q':
        sys.stderr.write("Job %d waiting in queue.\n" % (process_id))
        alive = True
    elif status == 'R':
        sys.stderr.write("Job %d is running.\n" % (process_id))
        alive = True
    elif status in ['H', 'S']:
        sys.stderr.write("Job %d is held or suspended.\n" % (process_id))
        alive = False

    if not alive:
        try:
            # Kill the job; pbs_deljob takes an extend string as its
            # third argument, as in the other delete calls here.
            c = pbs.pbs_connect(pbs.pbs_default())
            result = pbs.pbs_deljob(c, str(process_id), '')
            sys.stderr.write("Killed job %d.\n" % (process_id))
        except:
            sys.stderr.write("Failed to kill job %d.\n" % (process_id))
        return False
    else:
        return True
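# A hedged usage sketch for alive() above: poll a submitted job until it
# leaves the queue. 'runner' and the process id are placeholders; the
# object is assumed to hold a PBSQuery instance as self.pbsquery, as the
# method implies.
import time

def wait_for(runner, process_id, poll_seconds=30):
    while runner.alive(process_id):
        time.sleep(poll_seconds)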
def update_all_nodes(batchserver_name):
    """ Update info about all nodes of the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if batchserver_name not in pbs_data_nodes:
        pbs_data_nodes[batchserver_name] = {'last_update': None, 'nodes': {}}

    if pbs_data_nodes[batchserver_name]['last_update'] and \
       (datetime.datetime.now() - pbs_data_nodes[batchserver_name]['last_update']).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("Nodes info is new enough for server: %s" % batchserver_name)
        return pbs_data_nodes

    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statnodes = pbs.pbs_statnode(conn, "", [], "")
    pbs.pbs_disconnect(conn)

    for sn in statnodes:
        node, created = getNode(sn.name, server)
        attr_dict = dict([(x.name, x.value) for x in sn.attribs])
        pbs_data_nodes[batchserver_name]['nodes'][node] = update_one_node_from_pbs_data(node, attr_dict)
    pbs_data_nodes[batchserver_name]['last_update'] = datetime.datetime.now()
    return pbs_data_nodes
def submit_get_subfamilies_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(5)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "FAT-CAT Get Sub-Families: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)

    job.status_id = 5
    job.save()

    job_id = pbs.pbs_submit(c, attropl, "/clusterfs/ohana/software/fatcat/scripts/get_best_nodes.py", 'web', 'NULL')
    logger.info("Submitting %s to the grid to get best nodes with id %s" % (job.id, job_id))

    if job_id:
        job.get_best_nodes_pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def main():
    server = pbs.pbs_default()
    c = pbs.pbs_connect(server)

    nodes = pbs.pbs_statnode(c, '', 'NULL', 'NULL')
    for node in nodes:
        print node.name, ' :'
        attrs = node.attribs
        for attr in attrs:
            print '\t%s = %s' % (attr.name, attr.value)

        try:
            mom_port = socket.getservbyname('pbs_resmon', 'tcp')
        except socket.error:
            mom_port = pbs.PBS_MANAGER_SERVICE_PORT

        mom_id = pbs.openrm(node.name, mom_port)
        mom_keys = pbs.get_mom_values(mom_id)
        for key in mom_keys.keys():
            print '\t%s = %s' % (key, mom_keys[key])

        print '\nTesting list with user supplied keywords'
        l = ['bas', 'ncpus', 'loadave']
        mom_keys = pbs.get_mom_values(mom_id, l)
        for key in mom_keys.keys():
            print '\t%s = %s' % (key, mom_keys[key])
        print ''
        pbs.closerm(mom_id)
def queues_page():
    conn = pbs.pbs_connect(pbsserver)
    queues = get_queues(conn)
    pbs.pbs_disconnect(conn)
    queues = queue_attributes_reformat(queues)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %P')
    return {'now': now, 'queues': queues}
def jobs_page():
    conn = pbs.pbs_connect(pbsserver)
    jobs = get_jobs(conn)
    pbs.pbs_disconnect(conn)
    jobs = job_attributes_reformat(jobs)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %P')
    return {'now': now, 'jobs': jobs}
def connect(self, server=None):
    if not server:
        server = pbs.pbs_default()
    self._connection_id = pbs.pbs_connect(server)
    # pbs_connect returns a negative value on failure, which is truthy,
    # so test the sign explicitly instead of 'if not self._connection_id'
    if self._connection_id <= 0:
        raise PBSException('could not connect to pbs server ' + str(server))
def pbs_conn(self):
    conn = pbs.pbs_connect(pbs.pbs_default())
    if conn < 0:
        err, err_text = pbs.error()
        self.logging.error('Error in PBS server connect')
        self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
        raise SchedulerError('PBS error', str(err) + ': ' + err_text)
    return conn
def nodes_page():
    conn = pbs.pbs_connect(pbsserver)
    nodes = get_nodes(conn)
    pbs.pbs_disconnect(conn)
    nodes = node_attributes_reformat(nodes)
    node_totals = get_node_totals(nodes)
    now = datetime.datetime.now().strftime('%Y.%m.%d at %I:%M:%S %P')
    return {'now': now, 'nodes': nodes, 'node_totals': node_totals}
def connect_to_server(pbs_server=None):
    """Connect to PBS server and return connection."""
    if pbs_import_failed:
        raise EasyBuildError(pbs_import_failed)

    if not pbs_server:
        pbs_server = pbs.pbs_default()

    return pbs.pbs_connect(pbs_server)
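# A hedged usage sketch for connect_to_server() above: open a connection,
# print the server's version, and disconnect. pbs_statserver and the
# 'pbs_version' attribute are the same API used elsewhere in these
# snippets; the function name is made up for illustration.
def print_server_version():
    conn = connect_to_server()
    attr_l = pbs.new_attrl(1)
    attr_l[0].name = 'pbs_version'
    for entry in pbs.pbs_statserver(conn, attr_l, 'NULL'):
        for attrib in entry.attribs:
            print attrib.name, '=', attrib.value
    pbs.pbs_disconnect(conn)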
def del_job(self, job_id, server):
    # Create a new connection for the child process
    c = pbs.pbs_connect(str(pbs.pbs_default()))
    if server is None:
        server = pbs.pbs_default()
    job_full_id = job_id + '.' + server
    result = pbs.pbs_deljob(c, job_full_id, 'NULL')
    return result  # if the operation is successful, result == 0
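# A hedged usage sketch for del_job() above. 'mgr' stands in for whatever
# object carries these methods; the job id is a placeholder.
result = mgr.del_job('183', None)  # None falls back to the default server
if result != 0:
    err, err_text = pbs.error()
    print 'delete failed:', err, err_text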
def connect_to_server(pbs_server=None):
    """Connect to PBS server and return connection."""
    if pbs_import_failed:
        _log.error(pbs_import_failed)
        return None

    if not pbs_server:
        pbs_server = pbs.pbs_default()

    return pbs.pbs_connect(pbs_server)
def method1():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print "No default pbs server"
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    if con == -1:
        print "Default pbs server connection failed"
        pbs_server = pbs.pbs_fbserver()
        if not pbs_server:
            print "No pbs fallback server"
            sys.exit(1)
        else:
            con = pbs.pbs_connect(pbs_server)
            if con == -1:
                print "pbs fallback server connection failed"
                sys.exit(1)
    print "Connected to %s" % (pbs_server)
def _connect_to_server(server):
    """
    Open a connection to a pbs_server at hostname server; if server is
    None, connect to the default server.

    This function is shared between JobManager and TorqueJobRunner.
    """
    if server:
        connection = pbs.pbs_connect(server)
    else:
        connection = pbs.pbs_connect(pbs.pbs_default())

    if connection <= 0:
        e, e_msg = pbs.error()
        # the batch system returned an error, throw exception
        raise Exception("Error connecting to pbs_server. "
                        "Torque error {0}: '{1}'".format(e, torque_strerror(e)))

    return connection
def print_header():
    # Emit the CGI header; the blank line after Content-Type is required
    print "Content-Type: text/html"
    print

    # try connecting to the PBS server
    try:
        con = pbs.pbs_connect(pbs_server)
        nodes = pbs.pbs_statnode(con, "", "NULL", "NULL")
    except Exception, error:
        print "<h1>Error connecting to PBS server:</h1><tt>", error, "</tt>"
        sys.exit(1)
def __init__(self, options):
    super(Pbs, self).__init__(options)
    self.log = fancylogger.getLogger(self.__class__.__name__, fname=False)
    self.options = options
    self.log.debug("Provided options %s", options)

    self.pbs_server = pbs.pbs_default()
    self.pbsconn = pbs.pbs_connect(self.pbs_server)
    self.vars = {
        'cwd': 'PBS_O_WORKDIR',
        'jobid': 'PBS_JOBID',
    }
def submit_fxn_site_prediction_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(7)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "Functional Site Prediction Job: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)
    attropl[5].name = pbs.ATTR_r
    attropl[5].value = 'y'
    attropl[6].name = pbs.ATTR_l
    attropl[6].resource = 'walltime'
    attropl[6].value = '1000'

    job.status_id = 2
    job.save()

    job_id = pbs.pbs_submit(c, attropl, "/home/cyrus_afrasiabi/ohana_repository/bpg/fxn_site_prediction.py", 'web', 'NULL')
    logger.info("Submitting %s to the grid to get functional site predictions with id %s" % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def info(self, types=None):
    """
    Return jobinfo
    """
    if not self.jobid:
        self.log.debug("no jobid, job is not submitted yet?")
        return None

    # convert single type into list
    if type(types) is str:
        types = [types]
    self.log.debug("Return info types %s" % types)

    # create attribute list to query pbs with
    if types is None:
        jobattr = NULL
    else:
        jobattr = pbs.new_attrl(len(types))
        for idx, attr in enumerate(types):
            jobattr[idx].name = attr

    # get a new connection (otherwise this seems to fail)
    if self.clean_conn:
        pbs.pbs_disconnect(self.pbsconn)
        self.pbsconn = pbs.pbs_connect(self.pbs_server)

    jobs = pbs.pbs_statjob(self.pbsconn, self.jobid, jobattr, NULL)
    if len(jobs) == 0:
        # no job found, return None info
        res = None
        self.log.debug("No job found. Wrong id %s or job finished? Returning %s" % (self.jobid, res))
        return res
    elif len(jobs) == 1:
        self.log.debug("Request for jobid %s returned one result %s" % (self.jobid, jobs))
    else:
        self.log.error("Request for jobid %s returned more than one result %s" % (self.jobid, jobs))

    # only expect to have a list with one element
    j = jobs[0]
    # convert attribs into usable dict
    job_details = dict([(attrib.name, attrib.value) for attrib in j.attribs])
    # manually set 'id' attribute
    job_details['id'] = j.name
    self.log.debug("Found jobinfo %s" % job_details)
    return job_details
def check_single_job(self, pbs_server_name, job_id):
    """
    Returns the state of a single job, used to make sure a job is
    really dead.
    """
    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        log.debug("connection to PBS server %s for state check failed" % pbs_server_name)
        return None
    stat_attrl = pbs.new_attrl(1)
    stat_attrl[0].name = pbs.ATTR_state
    jobs = pbs.pbs_statjob(c, job_id, stat_attrl, None)
    pbs.pbs_disconnect(c)
    return jobs[0].attribs[0].value
def runAsDaemon():
    """ Run in daemon mode """
    # TODO: detach from console and log in syslog
    for bs in BatchServer.objects.all():
        conn = pbs.pbs_connect(bs.name.encode('iso-8859-1', 'replace'))
        if conn == -1:
            log(LOG_ERROR, "Cannot connect to batch server %s" % bs.name)
            continue
        update_all_queues(conn, bs)
        # update_all_jobs(conn, bs)
        update_all_nodes(conn, bs)
def main():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print "No default pbs server"
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)
    nodes = pbs.pbs_statnode(con, "", "NULL", "NULL")
    for node in nodes:
        print node.name
        for attrib in node.attribs:
            print '\t', attrib.name, '=', attrib.value
def submit_intrepid_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(6)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "INTREPID Job: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=8'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)
    attropl[5].name = pbs.ATTR_l
    attropl[5].resource = 'walltime'
    attropl[5].value = '48:00:00'

    if job.development_job:
        job_id = pbs.pbs_submit(c, attropl, "/clusterfs/ohana/software/intrepid/scripts/intrepid_development_pipeline.py", 'web', 'NULL')
    else:
        job_id = pbs.pbs_submit(c, attropl, "/clusterfs/ohana/software/intrepid/scripts/intrepid_pipeline.py", 'web', 'NULL')

    logger.info("Submitting %s to the grid with id %s" % (job.id, job_id))
    if job_id:
        job.pbs_job_id = job_id
        job.status_id = JOB_SUBMITTED
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def submitScript(script):
    result = {}
    try:
        pbs_connection = pbs.pbs_connect(pbs.pbs_default())
        attropl = pbs.new_attropl(4)
        # Set the name of the job
        attropl[0].name = pbs.ATTR_N
        attropl[0].value = str(script['jobName']) if script['jobName'] else "new_job"
        # Job is rerunable
        attropl[1].name = pbs.ATTR_r
        attropl[1].value = 'y'
        # Walltime
        attropl[2].name = pbs.ATTR_l
        attropl[2].resource = 'walltime'
        attropl[2].value = str(script['maxTime']) if script['maxTime'] else '01:00:00'
        # Nodes -- note the parentheses: without them the conditional
        # expression swallows the concatenation, so a falsy cpuNumber
        # would yield plain '1' instead of '1:ppn=1'
        attropl[3].name = pbs.ATTR_l
        attropl[3].resource = 'nodes'
        attropl[3].value = '1:ppn=' + (str(script['cpuNumber']) if script['cpuNumber'] else '1')

        # script['scriptName'] is the job script filename
        job_id = pbs.pbs_submit(pbs_connection, attropl, str(script['scriptName']), str(script['queue']), 'NULL')
        e, e_txt = pbs.error()
        if e:
            result['Result'] = 'ERROR'
            result['Message'] = str(e) + ' : ' + e_txt
        else:
            result['Result'] = 'OK'
            result['Message'] = job_id
    except Exception as exc:
        result['Result'] = 'ERROR'
        result['Message'] = str(exc)
    return result
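# A hedged usage sketch for submitScript() above; every key below is one
# the function reads, and all values are placeholders.
script = {
    'jobName': 'demo',
    'maxTime': '00:10:00',
    'cpuNumber': 4,
    'scriptName': 'demo.sh',
    'queue': 'batch',
}
print submitScript(script)  # {'Result': 'OK', 'Message': <job id>} on success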
def __init__(self):
    self.targets = collections.defaultdict(list)
    self.default = ""
    # Construct self.attrs from available attributes in the pbs module.
    # This provides a mapping from human readable names (no spaces) to
    # the module ATTR_* names. Not all ATTR_ entities are interesting.
    self.attrs = {}
    pbs_module_attrs = [a for a in dir(pbs) if a[0:5] == "ATTR_"]
    for attr in pbs_module_attrs:
        self.attrs[getattr(pbs, attr)] = str
    srvname = pbs.pbs_default()
    self.conn = pbs.pbs_connect(srvname)
    # By default, submit jobs to pbs
    self.pbs(True)
    self.dotAliases = {}
def main():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print 'No default server'
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)

    attr_l = pbs.new_attrl(1)
    attr_l[0].name = 'pbs_version'

    server_info = pbs.pbs_statserver(con, attr_l, 'NULL')
    for entry in server_info:
        print entry.name
        for attrib in entry.attribs:
            print '\t', attrib.name, ' = ', attrib.value
def update_one_queue(queue):
    """ Update live info about the given queue """
    conn = pbs.pbs_connect(queue.server.name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % queue.server.name)
        return
    statqueues = pbs.pbs_statque(conn, queue.name.encode('iso-8859-1', 'replace'), [], "")
    pbs.pbs_disconnect(conn)

    if len(statqueues) == 0:
        logging.error("pbs_statque failed for queue: %s" % queue.name)
        return
    if len(statqueues) > 1:
        logging.warning("pbs_statque returned more than one record for queue: %s" % queue.name)

    attr_dict = dict([(x.name, x.value) for x in statqueues[0].attribs])
    update_one_queue_from_pbs_data(queue, attr_dict)
    queue.save()
def hold_rls_job(self, job_id, server, mode, permission):
    '''
    Example:
        job_id: 183
        server: jim-desktop
        mode: hold | rls
        permission: u | o | s
    '''
    # Create a new connection for the child process
    c = pbs.pbs_connect(str(pbs.pbs_default()))
    if server is None:
        server = pbs.pbs_default()
    job_full_id = job_id + '.' + server
    if mode == 'hold':
        result = pbs.pbs_holdjob(c, job_full_id, permission, 'NULL')
    elif mode == 'rls':
        result = pbs.pbs_rlsjob(c, job_full_id, permission, 'NULL')
    else:
        # guard against an unknown mode instead of raising NameError on 'result'
        raise ValueError("mode must be 'hold' or 'rls', got %r" % mode)
    return result  # if the operation is successful, result == 0
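# A hedged usage sketch for hold_rls_job() above, following the docstring's
# own example values: place a user hold on a job, then release it. 'mgr'
# stands in for the object carrying the method.
mgr.hold_rls_job('183', 'jim-desktop', 'hold', 'u')
mgr.hold_rls_job('183', 'jim-desktop', 'rls', 'u')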
def main():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print 'No default server'
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)

    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'properties'
    attrop_l[0].value = 'set_something_useful'
    attrop_l[0].op = pbs.INCR

    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, "e2", attrop_l, 'NULL')
    if r > 0:
        print r, ";"
        errno, text = pbs.error()
        print errno, text
def shell_test():
    '''
    Function to do some tests/debugging when running in the shell.
    This is only used for debugging.
    '''
    print 'Running in the shell only.'
    conn = pbs.pbs_connect(pbsserver)
    if conn < 0:
        print 'Error connecting to PBS server.'
        print 'Have you set the PBS server hostname in this code?'
        sys.exit(1)

    # Uncomment one or more of the sections below to print info on the
    # nodes, queues and jobs.

    # Print nodes information
    nodes = get_nodes(conn)
    nodes = node_attributes_reformat(nodes)
    for node in sorted(nodes):
        print 'Node Name: %s' % node['node_name']
        print '  Mem:  ', node['resources_assigned_mem'], '/', node['resources_available_mem'], \
            'GB = ', '%3d' % node['mem_ratio'], '% used'
        print '  Cores:', node['resources_assigned_ncpus'], '/', node['resources_available_ncpus'], \
            '=', '%3d' % node['cpu_ratio'], '% used'
        # for key in node.keys():
        #     print '  ', key, ' = ', node[key]
    print '\nNode Totals: '
    print get_node_totals(nodes)

    '''
    # Print queues information
    queues = get_queues(conn)
    queues = queue_attributes_reformat(queues)
    for queue in queues:
        print '------ Queue Name: %s ------' % queue['queue_name']
        for key in queue.keys():
            print '  ', key, ' = ', queue[key]
    '''
def create_job(self, username, Job_Name, queue, nodes, walltime, file):
    c = pbs.pbs_connect(str(pbs.pbs_default()))
    attrl = pbs.new_attropl(3)
    attrl[0].name = pbs.ATTR_N
    attrl[0].value = str(Job_Name)
    attrl[1].name = pbs.ATTR_l
    attrl[1].resource = 'nodes'
    attrl[1].value = str(nodes)
    attrl[2].name = pbs.ATTR_l
    attrl[2].resource = 'walltime'
    attrl[2].value = str(walltime)
    queue = str(queue)
    # task_id holds the id of the submitted job; the method returns the
    # last error message from the server instead
    task_id = pbs.pbs_submit(c, attrl, str("media/" + username + "/" + file), queue, 'NULL')
    return pbs.pbs_geterrmsg(c)
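# A hedged usage sketch for create_job() above; arguments are placeholders
# and the job script is expected under media/<username>/. 'mgr' stands in
# for the object carrying the method.
err = mgr.create_job('alice', 'test_job', 'batch', '1:ppn=2', '00:30:00', 'run.sh')
print err  # empty when pbs_submit reported no error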
def __init__(self, settings):
    # spawn own thread, start up queue, start connection to server
    self.queue = []
    self.finished = []
    self.running = []
    self.error = []
    self.job_ids = []
    self.threads = []
    self.use_cluster = False
    self.connection = None
    self.curr_id = 0
    self.settings = settings

    if settings["global"]["use_cluster"] == True:
        import pbs
        self.use_cluster = True
    else:
        self.use_cluster = False

    self.max_threads = settings["global"]["n_processors"]

    if self.use_cluster:
        # Establish connection to PBS server
        serv_addr = settings["global"]["cluster_address"]
        # Let the cluster's jobman handle scheduling
        self.max_threads = sys.maxint
        self.connection = pbs.pbs_connect(serv_addr)
        if self.connection < 0:
            errno, text = pbs.error()
            print "Error, unable to establish connection to PBS server."
            print errno, text
            sys.exit(1)
def _connect(self):
    """Connect to the PBS/Torque server"""
    self.con = pbs.pbs_connect(self.server)
    if self.con < 0:
        # avoid shadowing the builtin 'str' with the message variable
        msg = "Could not make a connection with %s\n" % (self.server)
        raise PBSError(msg)
def connect_to_server(self):
    """Connect to PBS server, set and return connection."""
    if not self.conn:
        self.conn = pbs.pbs_connect(self.pbs_server)
    return self.conn
job_file = "%s/database/pbs/%s.sh" % (os.getcwd(), job_name) fh = file(job_file, "w") fh.write(script) fh.close() # define job attributes ofile = "%s/database/pbs/%s.o" % (os.getcwd(), job_name) efile = "%s/database/pbs/%s.e" % (os.getcwd(), job_name) job_attrs = pbs.new_attropl(2) job_attrs[0].name = pbs.ATTR_o job_attrs[0].value = ofile job_attrs[1].name = pbs.ATTR_e job_attrs[1].value = efile # get a handle conn = pbs.pbs_connect(pbs_server) # queue it if os.access(job_file, os.R_OK): log.debug("submitting file %s with output %s and error %s" % (job_file, ofile, efile) ) log.debug("command is: %s" % command_line) job_id = pbs.pbs_submit(conn, job_attrs, job_file, None, None) # monitor if job_id: p = PBSQuery() job_data = p.getjob(job_id) old_state = job_data[job_id]["job_state"] log.debug("initial state is %s" % old_state) running = False while True: