def main(args):
    """Main script."""
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        vsc_config = VscConfiguration()
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, opts.options.dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, opts.options.dry_run)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    p = PBSQuery()
    p.new_data_structure()

    #job = p.getjob('2983215')
    #print job['substate']
    #print job.substate
    #print job.queue
    #print job.Resource_List
    #print job.Resource_List.nodes
    #print job.Resource_List.arch
    #print job.Variable_List.PBS_O_HOME

    l = ['np', 'state']

    node = p.getnode("gb-r5n1", l)
    print node.name, node['np']

    # fetch all nodes; this was commented out (with a stray sys.exit(0)
    # above it), which left `nodes` undefined in the loop below
    nodes = p.getnodes(l)
    for id in nodes:
        print id
        try:
            print nodes[id].np
            print nodes[id].status.arch
            print nodes[id].status.uname
            print nodes[id].state
        except PBSError, detail:
            print detail
            pass
def _get_ppn(self):
    """Guess PBS' `ppn` value for a full node."""
    # cache this value as it's not likely going to change over the
    # `eb` script runtime ...
    if not self._ppn:
        pq = PBSQuery()
        node_vals = pq.getnodes().values()  # only the values, not the names
        interesting_nodes = ('free', 'job-exclusive',)
        res = {}
        for np in [int(x['np'][0]) for x in node_vals if x['state'][0] in interesting_nodes]:
            res.setdefault(np, 0)
            res[np] += 1

        if not res:
            raise EasyBuildError("Could not guess the ppn value of a full node because "
                                 "there are no free or job-exclusive nodes.")

        # return most frequent
        freq_count, freq_np = max([(j, i) for i, j in res.items()])
        self.log.debug("Found most frequent np %s (%s times) in interesting nodes %s" %
                       (freq_np, freq_count, interesting_nodes))
        self._ppn = freq_np

    return self._ppn
def main():
    p = PBSQuery()
    p.new_data_structure()

    #job = p.getjob('2983215')
    #print job['substate']
    #print job.substate
    #print job.queue
    #print job.Resource_List
    #print job.Resource_List.nodes
    #print job.Resource_List.arch
    #print job.Variable_List.PBS_O_HOME

    l = ['np', 'state']

    #node = p.getnode("r2n2", l)
    #print node.name
    #print node.name, node['np']
    #sys.exit(0)

    nodes = p.getnodes(l)
    for id in nodes:
        print id
        try:
            print nodes[id].np
            #print nodes[id].status.arch
            print nodes[id].status.uname
            print nodes[id].state
        except PBSError, detail:
            print detail
            pass
def get_ppn():
    """Guess the ppn for full node"""
    log = fancylogger.getLogger('pbs_job.get_ppn')
    pq = PBSQuery()
    node_vals = pq.getnodes().values()  # only the values, not the names
    interesting_nodes = ('free', 'job-exclusive',)
    res = {}
    for np in [int(x['np'][0]) for x in node_vals if x['state'][0] in interesting_nodes]:
        res.setdefault(np, 0)
        res[np] += 1

    # return most frequent
    freq_count, freq_np = max([(j, i) for i, j in res.items()])
    log.debug("Found most frequent np %s (%s times) in interesting nodes %s" % (freq_np, freq_count, interesting_nodes))

    return freq_np
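# Minimal usage sketch, assuming a reachable Torque server and the PBSQuery /
# fancylogger imports used above; note that max() raises ValueError when no
# node is free or job-exclusive:
if __name__ == '__main__':
    print 'ppn of a full node:', get_ppn()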
def main():
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    l = list()
    jobs = "none"
    for id in nodes:
        try:
            queue = nodes[id].properties[0]
            state = nodes[id].state[0]
            power = nodes[id].power_state[0]
            np = nodes[id].np[0]
            name = nodes[id].name
            memory = nodes[id].status.physmem[0]
            memory = memory[:-2]        # strip the 'kb' suffix
            memory = int(memory)
            memory = memory / 1000000   # kB -> GB (integer division)
            load = nodes[id].status.loadave[0]
            display = " "
            if hasattr(nodes[id], "jobs"):
                jobs = nodes[id].jobs
                results = len(jobs)
                for result in range(results):
                    display += "x"      # one 'x' per job on the node
                #result = str()
                #s = ", "
                #result = s.join(jobs)
            else:
                display = "0"
            l.append([name, state, power, queue, np, memory, load, display])
        except PBSError, detail:
            print detail
            pass
def countppn(queue):
    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()
    nptot = 0
    for id in jobs:
        try:
            if jobs[id].queue[0] == queue and jobs[id].job_state[0] == 'R':
                np = jobs[id].Resource_List.nodes
                if 'ppn' not in np[0]:
                    np = 1
                else:
                    # sum every 'ppn=<n>' occurrence in the nodes spec,
                    # e.g. '2:ppn=4+1:ppn=8'
                    npptot = 0
                    ct = [m.start() for m in re.finditer('ppn', np[0])]
                    for val in ct:
                        char = np[0]
                        vals = val + 4
                        valf = val + 6
                        npp = char[vals:valf]               # up to 2 digits after 'ppn='
                        npp = re.sub('[!@#+:$]', '', npp)   # drop any separator caught by the slice
                        npp = int(npp)
                        npptot = npp + npptot
                    np = npptot
                nptot = np + nptot
        except PBSError, detail:
            print detail
            pass
    # the queue-usage report below uses the total, so return it
    return nptot
def main():
    """Main script."""
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        vsc_config = VscConfiguration(VSC_CONF_DEFAULT_FILENAME)
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def pbs_handler(name):
    pbs = PBSQuery()
    jobs = pbs.getjobs()
    for jobid, jobinfo in jobs.iteritems():
        publish(jobid, jobinfo)
    publish_queue_state(pbs)
    return ""
def main():
    """ Main script. """

    options = {
        "jobid": ("The PBS_JOBID of the job for which we want information", None, "store", None),
        "information": ("Comma-separated list of the job info to print. "
                        "Entries of the format input_key:output_key", None, "store", None),
    }
    opts = simple_option(options)

    if not opts.options.jobid:
        logger.error("jobid is a required option. Bailing.")
        sys.exit(1)

    pquery = PBSQuery()
    current_job = pquery.getjob(opts.options.jobid)

    s = transform_info(current_job, opts.options.information)

    print "\n".join(s)
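# Invocation sketch (the script name and job id below are illustrative):
#
#   python job_info.py --jobid 12345.master --information "job_state:state,exec_host:host"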
def main():
    pq = PBSQuery()
    nodedict = pq.getnodes()
    for nodename, node in sorted(nodedict.iteritems()):
        print nodename
        for k, v in node.iteritems():
            print k, v
def getModelServers():
    resultServers = []
    pQuery = PBSQuery()
    try:
        servers = pQuery.get_serverinfo()
        for serverName, pbsServer in servers.items():
            customServer = PBSServer(name=serverName)
            try:
                customServer.state = TorqueService._listToStr(pbsServer[pbs.ATTR_status], '|')
            except KeyError:
                pass
            try:
                customServer.total_jobs = TorqueService._listToInt(pbsServer[pbs.ATTR_total])
            except KeyError:
                pass
            try:
                customServer.running_jobs = int(TorqueService._strToDict(pbsServer[pbs.ATTR_count][0])['Running'])
            except KeyError:
                pass
            try:
                customServer.queued_jobs = int(TorqueService._strToDict(pbsServer[pbs.ATTR_count][0])['Queued'])
            except KeyError:
                pass
            try:
                customServer.pbs_version = TorqueService._listToStr(pbsServer[pbs.ATTR_pbsversion], '|')
            except KeyError:
                pass
            resultServers.append(customServer)
    except PBSError as pbsErr:
        print(pbsErr)
    return resultServers
class PBSManager():
    def __init__(self):
        self.p = PBSQuery()
        self.queue = self.p.getqueue(survey)
        self.jobs = self.p.getjobs()
        self.nodes = self.p.getnodes_with_property(use_ressources)
        self.queue_names = self.jobs.keys()
        #print self.nodes
        #print self.nodes['psr13']['ncpus']

    def jobs_running(self):
        queue_status = self.queue['state_count'][0]
        Transit, Queued, Held, Waiting, Running, Exiting = queue_status.split()
        Queued = int(Queued.split(':')[1])
        Running = int(Running.split(':')[1])
        return Running, Queued

    def is_running(self, basefilename):
        jobnames = []
        for jobs_name in self.queue_names:
            job_info = self.jobs[jobs_name]
            jobnames.append(job_info["Job_Name"][0])
        return (basefilename in jobnames)

    def get_stderr_path(self, basefilename):
        stderr_path = os.path.join(logs_loc, "%s.err" % (basefilename))
        if not os.path.exists(stderr_path):
            raise ValueError("Cannot find error log for job (%s): %s" %
                             (basefilename, stderr_path))
        return stderr_path

    def had_errors(self, basefilename):
        try:
            errorlog = self.get_stderr_path(basefilename)
        except ValueError:
            errors = False
        else:
            if os.path.getsize(errorlog) > 0:
                errors = True
            else:
                errors = False
        return errors

    def get_errors(self, basefilename):
        try:
            errorlog = self.get_stderr_path(basefilename)
        except ValueError, e:
            errors = str(e)
        else:
            # the source snippet breaks off here; reading the log back is the
            # plausible completion given had_errors() above
            with open(errorlog) as f:
                errors = f.read()
        return errors
def main():
    pq = PBSQuery()
    print 'Server:', pq.server
    serverinfo = pq.get_serverinfo()
    for k, v in sorted(serverinfo.iteritems()):
        print k
        for i, j in sorted(v.iteritems()):
            print i, j
def main():
    print "CPU cluster usage :"
    p = PBSQuery()
    queues = p.getqueues()
    for queue in queues.keys():
        np = countcpu(queue)
        npp = countppn(queue)
        print "%s : [ %s / %s ] " % (queue, npp, np)
def main():
    print "Queue memory usage in Gb [ reserved / available ] : "
    p = PBSQuery()
    queues = p.getqueues()
    for queue in queues.keys():
        np = countmem(queue)
        npp = countppn(queue)
        print "%s : [ %s / %s ] " % (queue, np, npp)
def main():
    pq = PBSQuery()
    jobsdict = pq.getjobs()
    for jobid, job in sorted(jobsdict.iteritems()):
        print jobid
        for k, v in job.iteritems():
            print " ", k, v
        print ""
def main():
    pq = PBSQuery()
    queuedict = pq.getqueues()
    print queuedict
    for queuename, queue in sorted(queuedict.iteritems()):
        print queuename
        for k, v in queue.iteritems():
            print k, v
        print
def getModelJobs():
    """
    1. get jobs
    2. get users
    3. map each job to User and Queue
    4. save all jobs
    """
    resultJobs = []
    pQuery = PBSQuery()
    try:
        jobs = pQuery.getjobs()
        for jobName, pbsJob in jobs.items():
            customJob = PBSJob(jobId=jobName)
            try:
                customJob.name = TorqueService._listToStr(pbsJob[pbs.ATTR_name], '|')
            except KeyError:
                pass
            try:
                customJob.owner = TorqueService._listToStr(pbsJob[pbs.ATTR_owner], '|')
            except KeyError:
                pass
            try:
                customJob.state = TorqueService._listToStr(pbsJob[pbs.ATTR_state], '|')
            except KeyError:
                pass
            try:
                customJob.queue_raw = TorqueService._listToStr(pbsJob[pbs.ATTR_queue], '|')
            except KeyError:
                pass
            try:
                customJob.start_time = datetime.fromtimestamp(TorqueService._listToInt(pbsJob[pbs.ATTR_start_time]))
            except KeyError:
                pass
            try:
                customJob.resource_cput = TorqueService._listToStr(pbsJob[pbs.ATTR_used]['cput'], '|')
            except KeyError:
                pass
            try:
                customJob.resource_mem = TorqueService._listToStr(pbsJob[pbs.ATTR_used]['mem'], '|')
            except KeyError:
                pass
            try:
                customJob.resource_vmem = TorqueService._listToStr(pbsJob[pbs.ATTR_used]['vmem'], '|')
            except KeyError:
                pass
            try:
                customJob.resource_walltime = TorqueService._listToStr(pbsJob[pbs.ATTR_used]['walltime'], '|')
            except KeyError:
                pass
            resultJobs.append(customJob)
    except PBSError as pbsErr:
        print(pbsErr)
    return resultJobs
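# Wiring sketch (illustrative, not from the source): assuming these getters
# are staticmethods on a TorqueService class and the PBS* objects are ORM
# models exposing save(), a periodic refresh could look like this; the
# refreshModels name is hypothetical:
def refreshModels():
    for server in TorqueService.getModelServers():
        server.save()
        for queue in TorqueService.getModelQueues(server):
            queue.save()
    for job in TorqueService.getModelJobs():
        job.save()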
def print_header():
    # try connecting to the PBS server
    print "Content-Type: text/html"
    print   # a blank line terminates the CGI headers
    try:
        pbs = PBSQuery(SERVER)
        jobs = pbs.getjobs()
        # ldin = ldap.open("192.168.0.90")
    except PBSError, error:
        print "<h1>Error connecting to PBS server:</h1><tt>", error, "</tt>"
        sys.exit(1)
def __init__(self, server_name):
    p = PBSQuery(str(server_name))
    info = p.get_serverinfo().items()[0]
    self.name = info[0]
    self.p = p
    for k, v in info[1].items():
        if k.startswith('resources'):
            for i, j in v.items():
                setattr(self, k + '_' + i, j[0])
        else:
            setattr(self, k, v[0])
def print_table_summary():
    global PBS_STATES
    global OPT_SERVERNAME

    try:
        if not OPT_SERVERNAME:
            p = PBSQuery()
        else:
            p = PBSQuery(OPT_SERVERNAME)
    except PBSError, reason:
        _print('error: %s' % reason)
        sys.exit(-1)
def getData():
    p = PBSQuery()
    nodes = p.getnodes()
    for node in nodes.keys():
        host = TorqueHost(node)
        # each attribute may be missing, so guard the dict lookups individually
        try:
            host.setState(nodes[node]['state'])
        except KeyError:
            pass
        try:
            host.setSlots(int(nodes[node]['np'][0]))
        except KeyError:
            pass
        try:
            host.setSlotsUsed(len(nodes[node]['jobs']))
        except KeyError:
            host.setSlotsUsed(0)
        try:
            jobs = nodes[node]['status']['jobs'][0].split()
            host.setJobList(jobs)
        except KeyError:
            pass
        try:
            availmem = nodes[node]['status']['availmem'][0]
            host.setAvailMem(availmem)
        except KeyError:
            pass
        try:
            totalmem = nodes[node]['status']['totmem'][0]
            host.setTotalMem(totalmem)
        except KeyError:
            pass
        try:
            rectime = nodes[node]['status']['rectime'][0]
            host.setRecTime(rectime)
        except KeyError:
            pass
        try:
            loadave = nodes[node]['status']['loadave'][0]
            host.setLoadAve(loadave)
        except KeyError:
            pass
        try:
            netload = nodes[node]['status']['netload'][0]
            host.setNetLoad(netload)
        except KeyError:
            pass
        Hosts[host.Name] = host
def countcpu(queue):
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    nptot = 0
    for id in nodes:
        if nodes[id].properties == [queue]:
            try:
                np = nodes[id].np
                np = int(np[0])
                nptot = nptot + np
            except PBSError, detail:
                print detail
                pass
    # the queue-usage report uses the total, so return it
    return nptot
def getJobs():
    resultJobs = []
    p = PBSQuery()
    try:
        jobs = p.getjobs()
        for jobId, pbsJob in jobs.items():
            customJob = Job(jobId)
            try:
                customJob.state = TorqueService._listToStr(pbsJob[pbs.ATTR_state], '|')
            except KeyError:
                pass
            try:
                customJob.user = TorqueService._listToStr(pbsJob[pbs.ATTR_owner], '|')
            except KeyError:
                pass
            try:
                customJob.queue = TorqueService._listToStr(pbsJob[pbs.ATTR_queue], '|')
            except KeyError:
                pass
            try:
                customJob.name = TorqueService._listToStr(pbsJob[pbs.ATTR_name], '|')
            except KeyError:
                pass
            try:
                customJob.cpu_time = TorqueService._listToStr(pbsJob[pbs.ATTR_l]['walltime'], '|')
            except KeyError:
                pass
            try:
                customJob.n_p = TorqueService._splitResourcesList(pbsJob[pbs.ATTR_l]['nodes'])
            except KeyError:
                pass
            try:
                customJob.setQueued(TorqueService._listToStr(pbsJob[pbs.ATTR_qtime], '|'))
            except KeyError:
                pass
            try:
                customJob.setStarted(TorqueService._listToStr(pbsJob[pbs.ATTR_start_time], '|'))
            except KeyError:
                pass
            try:
                customJob.running_time = TorqueService._listToStr(pbsJob[pbs.ATTR_used]['walltime'], '|')
            except KeyError:
                pass
            resultJobs.append(customJob)
    except PBSError as pbsErr:
        print(pbsErr)
    return resultJobs
def main(args):
    """Main script."""
    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios_check_filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios_check_interval_threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    nagios_reporter = NagiosReporter(NAGIOS_HEADER, NAGIOS_CHECK_FILENAME, NAGIOS_CHECK_INTERVAL_THRESHOLD)
    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    try:
        vsc_config = VscConfiguration()
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, opts.options.dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, opts.options.dry_run)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("Something went wrong: {err}".format(err=err))
        nagios_reporter.cache(NAGIOS_EXIT_CRITICAL,
                              NagiosResult("Script failed, check log file ({logfile})".format(logfile=PBS_CHECK_LOG_FILE)))
        sys.exit(NAGIOS_EXIT_CRITICAL)
def get_ppn(self):
    """Guess the ppn for full node"""
    pq = PBSQuery()
    node_vals = pq.getnodes().values()  # only the values, not the names
    interesting_nodes = ('free', 'job-exclusive',)
    res = {}
    for np in [int(x['np'][0]) for x in node_vals if x['state'][0] in interesting_nodes]:
        res.setdefault(np, 0)
        res[np] += 1

    # return most frequent
    freq_count, freq_np = max([(j, i) for i, j in res.items()])
    self.log.debug("Found most frequent np %s (%s times) in interesting nodes %s" % (freq_np, freq_count, interesting_nodes))

    return freq_np
def countmem(queue):
    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()
    nptot = 0
    for id in jobs:
        if jobs[id].queue == [queue]:
            try:
                np = jobs[id].Resource_List.mem
                np = np[0][:-2]   # strip the unit suffix, e.g. '4gb' -> '4'
                np = int(np)
                nptot = nptot + np
            except PBSError, detail:
                print detail
                pass
    return nptot
def get_nodes(racknode=False, hosts=None):
    global LENGTH_NODE
    global LENGTH_STATE
    global OPT_SERVERNAME

    nodes_dict = dict()

    try:
        if not OPT_SERVERNAME:
            p = PBSQuery()
        else:
            p = PBSQuery(OPT_SERVERNAME)
    except PBSError, reason:
        _print('Error: %s' % reason)
        sys.exit(-1)
def __init__(self, server, verbosity=0, diagOutputFile=None):
    self.SRList = {}
    self.activeNodes = {}
    self.verbosity = verbosity
    self.server = server

    # Load Torque configuration
    try:
        torqueConfig = PBSQuery(server)
        self.server_info = torqueConfig.get_serverinfo()[server]
        self.nodes = torqueConfig.getnodes()
        self.queues = torqueConfig.getqueues()
    except PBSError, e:
        self.__debug(0, "Error connecting to PBS server: %s" % e)
        sys.exit(1)
def countppn(queue):
    # despite the name, this sums physical memory (in GB) over the queue's nodes
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    nptot = 0
    for id in nodes:
        try:
            if nodes[id].properties[0] == queue:
                np = nodes[id].status.physmem[0]
                np = np[:-2]          # strip the 'kb' suffix
                np = int(np)
                np = np / 1000000     # kB -> GB (integer division)
                nptot = np + nptot
        except PBSError, detail:
            print detail
            pass
    return nptot
def main():
    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()
    for id in jobs:
        try:
            np = jobs[id].Resource_List.mem
            nd = jobs[id].Resource_List.nodect
            nn = jobs[id].Resource_List
            nom = jobs[id].exec_host
            queue = jobs[id].queue
            #print nom, np, nd, queue, nn
            print nn
        except PBSError, detail:
            print detail
            pass
def main():
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    for id in nodes:
        try:
            #np = nodes[id].status.physmem[0]
            #queue = nodes[id].properties[0]
            #print np, queue
            print nodes[id].status.loadave
            #print nodes[id].status.ncpus
            #if hasattr(nodes[id], "jobs"):
            #    for job in nodes[id].jobs:
            #        print job
        except PBSError, detail:
            print detail
            pass
def main():
    """Main script."""
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
    }
    opts = ExtendedSimpleOption(options)

    try:
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(days=1)

        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        active_users, inactive_users = client.get_accounts()

        grace_users = []
        for a in active_users:
            try:
                if a.expiry_date and datetime.datetime.strptime(a.expiry_date, "%Y-%m-%d") - now < datetime.timedelta(days=7):
                    grace_users.append(a)
            except AttributeError as err:
                logger.debug("Account %s does not have expiry date", a.vsc_id)

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def _get_ppn(self):
    """Guess PBS' `ppn` value for a full node."""
    # cache this value as it's not likely going to change over the
    # `eb` script runtime ...
    if not self._ppn:
        pq = PBSQuery()
        node_vals = pq.getnodes().values()  # only the values, not the names
        interesting_nodes = ('free', 'job-exclusive',)
        res = {}
        for np in [int(x['np'][0]) for x in node_vals if x['state'][0] in interesting_nodes]:
            res.setdefault(np, 0)
            res[np] += 1

        # return most frequent
        freq_count, freq_np = max([(j, i) for i, j in res.items()])
        self.log.debug("Found most frequent np %s (%s times) in interesting nodes %s" %
                       (freq_np, freq_count, interesting_nodes))
        self._ppn = freq_np

    return self._ppn
class PBSScheduler(AbstractClusterScheduler):

    def __init__(self, *args, **kwargs):
        self.pbsquery = PBSQuery()
        super(PBSScheduler, self).__init__(*args, **kwargs)

    def output_regexp(self):
        return r'(^\d+)'

    def submit_command(self, output_file, job_name):
        # Note for posterity: ssh bint01 "source BLAH && qsub BLAH" doesn't work
        #return """ssh bint01 "/usr/syscom/nsg/opt/torque/4.2.6/bin/qsub -S /bin/bash -N %s -e %s -o %s -j oe" """ % \
        #    (job_name, output_file, output_file)
        #return """qsub -S /bin/bash -N %s -e %s -o %s -j oe" """ % (job_name, output_file, output_file)
        return """qsub -N %s -e %s -o %s -j oe """ % (job_name, output_file, output_file)
        # ' '.join(['ssh', 'bint01', '"qsub', '-S', '/bin/bash',
        #           '-N', "%s" % (job_name),
        #           '-e', output_file,
        #           '-o', output_file,
        #           '-j', 'oe"',
        #           ])

    def alive(self, process_id):
        alive = False
        try:
            status = self.pbsquery.getjob(str(process_id))['job_state'][0]
        except:
            # job not found
            status = -1
            sys.stderr.write("EXC: %s\n" % str(sys.exc_info()[0]))
            sys.stderr.write("Could not find job for process id %d\n" % process_id)

        if status == 'Q':
            sys.stderr.write("Job %d waiting in queue.\n" % (process_id))
            alive = True
        elif status == 'R':
            sys.stderr.write("Job %d is running.\n" % (process_id))
            alive = True
        elif status in ['H', 'S']:
            sys.stderr.write("Job %d is held or suspended.\n" % (process_id))
            alive = False

        if not alive:
            try:
                # Kill the job.
                c = pbs.pbs_connect(pbs.pbs_default())
                result = pbs.pbs_deljob(c, str(process_id))
                sys.stderr.write("Killed job %d.\n" % (process_id))
            except:
                sys.stderr.write("Failed to kill job %d.\n" % (process_id))
            return False
        else:
            return True
class PBSScheduler(AbstractClusterScheduler):

    def __init__(self, *args, **kwargs):
        self.pbsquery = PBSQuery()
        super(PBSScheduler, self).__init__(*args, **kwargs)

    def output_regexp(self):
        return r'(^\d+)'

    def submit_command(self, output_file, job_name):
        # Note for posterity: ssh bint01 "source BLAH && qsub BLAH" doesn't work
        return """ssh bint01 "/usr/syscom/nsg/opt/torque/4.2.6/bin/qsub -S /bin/bash -N %s -e %s -o %s -j oe" """ % \
            (job_name, output_file, output_file)
        # ' '.join(['ssh', 'bint01', '"qsub', '-S', '/bin/bash',
        #           '-N', "%s" % (job_name),
        #           '-e', output_file,
        #           '-o', output_file,
        #           '-j', 'oe"',
        #           ])

    def alive(self, process_id):
        alive = False
        try:
            status = self.pbsquery.getjob(str(process_id))['job_state'][0]
        except:
            # job not found
            status = -1
            sys.stderr.write("EXC: %s\n" % str(sys.exc_info()[0]))
            sys.stderr.write("Could not find job for process id %d\n" % process_id)

        if status == 'Q':
            sys.stderr.write("Job %d waiting in queue.\n" % (process_id))
            alive = True
        elif status == 'R':
            sys.stderr.write("Job %d is running.\n" % (process_id))
            alive = True
        elif status in ['H', 'S']:
            sys.stderr.write("Job %d is held or suspended.\n" % (process_id))
            alive = False

        if not alive:
            try:
                # Kill the job.
                c = pbs.pbs_connect(pbs.pbs_default())
                result = pbs.pbs_deljob(c, str(process_id))
                sys.stderr.write("Killed job %d.\n" % (process_id))
            except:
                sys.stderr.write("Failed to kill job %d.\n" % (process_id))
            return False
        else:
            return True
def getModelQueues(pbsServer):
    resultQueues = []
    pQuery = PBSQuery()
    try:
        queues = pQuery.getqueues()
        for queueName, pbsQueue in queues.items():
            customQueue = PBSQueue(server=pbsServer, name=queueName)
            try:
                customQueue.type = TorqueService._listToStr(pbsQueue['queue_type'], '|')
            except KeyError:
                pass
            try:
                customQueue.total_jobs = TorqueService._listToInt(pbsQueue[pbs.ATTR_total])
            except KeyError:
                pass
            try:
                customQueue.running_jobs = int(TorqueService._strToDict(pbsQueue[pbs.ATTR_count][0])['Running'])
            except KeyError:
                pass
            try:
                customQueue.queued_jobs = int(TorqueService._strToDict(pbsQueue[pbs.ATTR_count][0])['Queued'])
            except KeyError:
                pass
            try:
                customQueue.resource_walltime = TorqueService._listToStr(pbsQueue[pbs.ATTR_rescdflt]['walltime'], '|')
            except KeyError:
                pass
            try:
                customQueue.resource_nodes = TorqueService._listToInt(pbsQueue[pbs.ATTR_rescdflt]['nodes'])
            except KeyError:
                pass
            resultQueues.append(customQueue)
    except PBSError as pbsErr:
        print(pbsErr)
    return resultQueues
def main():
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    l = list()
    for id in nodes:
        try:
            queue = nodes[id].properties[0]
            state = nodes[id].state[0]
            power = nodes[id].power_state[0]
            np = nodes[id].np[0]
            name = nodes[id].name
            if hasattr(nodes[id], "jobs"):
                jobs = nodes[id].jobs[0].split('/')
                if len(jobs) > 1:
                    jobs = jobs[1]
                else:
                    jobs = jobs[0]
            else:
                jobs = "none"
            l.append([name, state, power, queue, np, jobs])
        except PBSError, detail:
            print detail
            pass
class Queue(object):
    def __init__(self, name, pbsqueue_dict):
        # the class header and start of __init__ were cut off in the source;
        # this signature is reconstructed from the Queue(name=..., pbsqueue_dict=...)
        # call in the __main__ block below
        self.name = name
        for p in ['started', 'enabled', 'acl_group_enable']:
            if p in pbsqueue_dict:
                if pbsqueue_dict[p][0] == 'True':
                    self.__dict__[p] = True
                else:
                    self.__dict__[p] = False

        if 'state_count' in pbsqueue_dict:
            self.state_count = {}
            state_counts = pbsqueue_dict['state_count'][0].strip().split(' ')
            for s in state_counts:
                name, count = s.split(':')
                self.state_count[name] = int(count)

    def __str__(self):
        return str(self.__dict__)


if __name__ == '__main__':
    pq = PBSQuery()
    queues = {}
    for k, v in sorted(pq.getqueues().iteritems()):
        queues[k] = Queue(name=k, pbsqueue_dict=v)

    for queuename, queue in sorted(queues.iteritems()):
        print queuename, ':', queue
#!/usr/bin/env python
#
# Author: Bas van der Vlies <*****@*****.**>
# Date  : 17 Aug 2001
# Desc. : Simple pbsnodes -a
#
# SVN info:
#   $Id: node_event_parse.py 287 2012-12-20 11:03:07Z bas $
#   $URL: https://oss.trac.surfsara.nl/pbs_python/svn/tags/4.6.0/examples/node_event_parse.py $
#
import pbs
import sys

from PBSQuery import PBSQuery

p = PBSQuery()
node = p.getnode('apccl02.in2p3.fr')
print node['event']
class QstatViewer:
    """
    Presents a nicer (?) interface to PBSQuery.
    The two main member objects are:
    * jobs -- a dictionary with job ID (as str) as the key, and the corresponding Job object as the value
    * nodes -- a dictionary with node name as the key, and a set of corresponding job IDs (of jobs running on node)
    """
    def __init__(self, pbs_server=None, debug_p=False):
        """Creates a QstatViewer object.

        Arguments:
        - pbs_server : FQDN of the TORQUE server to query (string)"""
        self.debug_p = debug_p

        self.nodes = {}
        self.jobs = {}
        self.queues = {}

        self.pbsquery = PBSQuery(pbs_server)

        self.servername = self.pbsquery.get_server_name()

        self.__make_server()
        self.__make_queues()
        self.__make_jobs()
        self.__make_nodes()

    def __make_nodes(self):
        """Make dict with node names as keys, and list of job objects as values"""
        # make list of jobids running on the node
        #node_jobs = {}
        #for jobid,job in self.jobs.iteritems():
        #    if job.exec_host:
        #        for node_cpu in job.exec_host:
        #            node = node_cpu.split('/')[0]
        #            if node not in node_jobs:
        #                node_jobs[node] = []
        #            else:
        #                node_jobs[node].append(jobid)

        rawnodes = self.pbsquery.getnodes()
        for n, s in rawnodes.iteritems():
            self.nodes[n] = Node(name=n, pbsnodes_dict=dict(s), debug_p=self.debug_p)

    def __make_jobs(self):
        """Make dict with job IDs as keys, and job properties as values"""
        rawjobs = self.pbsquery.getjobs()
        for j, p in rawjobs.iteritems():
            self.jobs[j] = Job(id=j, pbsjobs_dict=dict(p), debug_p=self.debug_p)

    def __make_queues(self):
        """make dict with queue names as keys, and queue properties as values"""
        rawqueues = self.pbsquery.getqueues()
        for q, p in rawqueues.iteritems():
            self.queues[q] = Queue(name=q, pbsqueue_dict=p)

    def __make_server(self):
        self.__serverinfo = self.pbsquery.get_serverinfo()[self.servername]
        if self.debug_p:
            print 'FOOBAR: self.serverinfo =', self.__serverinfo
        for k, v in self.__serverinfo.iteritems():
            self.__dict__[k] = None
            if k == 'state_count':
                # Example of state_count: Transit:0 Queued:-6458 Held:6383 Waiting:0 Running:964 Exiting:0
                self.__dict__[k] = {}
                vals = v[0].strip().split(' ')
                for state in vals:
                    statename = state.split(':')[0]
                    stateval = int(state.split(':')[1])
                    self.__dict__[k][statename] = stateval
            elif k == 'resources_default':
                v['mem'] = Memory(v['mem'][0])
                v['pmem'] = Memory(v['pmem'][0])
                v['cput'] = pbstimestr_to_timedelta(v['cput'][0])
                v['walltime'] = pbstimestr_to_timedelta(v['walltime'][0])
                self.__dict__[k] = v
            elif k == 'resources_assigned':
                if 'mem' in v:
                    v['mem'] = Memory(v['mem'][0])
                if 'vmem' in v:
                    v['vmem'] = Memory(v['vmem'][0])
                if 'ncpus' in v:
                    v['ncpus'] = int(v['ncpus'][0])
                if 'nodect' in v:
                    v['nodect'] = int(v['nodect'][0])
                self.__dict__[k] = v
            elif k == 'scheduling' or k == 'query_other_jobs':
                if v[0] == 'True':
                    v[0] = True
                elif v[0] == 'False':
                    v[0] = False
                self.__dict__[k] = v[0]
            elif k == 'scheduler_iteration':
                self.__dict__[k] = datetime.timedelta(seconds=int(v[0]))
            elif k == 'next_job_number' or k == 'node_check_rate' or k == 'tcp_timeout' or k == 'total_jobs':
                self.__dict__[k] = int(v[0])
            elif len(v) == 1:
                self.__dict__[k] = v[0]
            else:
                self.__dict__[k] = v

    def get_job(self, jobid):
        """Queries the queue for jobid"""
        j = self.pbsquery.getjob(jobid)
        if self.debug_p:
            print 'ALOHA: ',
            print j.__dict__['data']
        if 'data' in j.__dict__:
            return Job(id=jobid, pbsjobs_dict=dict(j), debug_p=self.debug_p)
        else:
            return None

    def jobs_by_user(self, username=None):
        """Returns a dict of jobs (keyed by jobid) belonging to username"""
        retval = {}
        if not username:
            retval = None
        else:
            for jobid, job in self.jobs.iteritems():
                if job.owner == username:
                    retval[jobid] = job
        return retval

    def nodes_with_property(self, prop):
        """Returns a dict of nodes (keyed by nodename) having the given property string"""
        retval = {}
        if prop:
            for nodename, node in self.nodes.iteritems():
                if prop in node.properties:
                    retval[nodename] = node
        else:
            retval = self.nodes
        return retval

    def nodes_in_clan(self, clan):
        """Returns a dict of nodes (keyed by nodename) belonging to the given clan"""
        retval = {}
        if clan:
            for nodename, node in self.nodes.iteritems():
                if clan == node.clan:
                    retval[nodename] = node
        else:
            retval = self.nodes
        return retval

    def __unicode__(self):
        if self.debug_p:
            print 'FOOBAR: type(self.jobs) =', type(self.jobs)
            print 'FOOBAR: self.jobs =', self.jobs
        job_dict_list = []
        for k, v in self.jobs.iteritems():
            job_dict_list.append(str(v))
        return str(job_dict_list)

    def __str__(self):
        return self.__unicode__()
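# Minimal usage sketch for QstatViewer, assuming a reachable Torque server
# and the Job/Node/Queue/Memory helpers referenced above:
if __name__ == '__main__':
    qv = QstatViewer()
    print 'server:', qv.servername
    print 'queues:', sorted(qv.queues.keys())
    # 'someuser' is a placeholder username
    for jobid, job in sorted(qv.jobs_by_user('someuser').iteritems()):
        print jobid, job.owner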
# get a handle
conn = pbs.pbs_connect(pbs_server)

# queue it
if os.access(job_file, os.R_OK):
    log.debug("submitting file %s with output %s and error %s" % (job_file, ofile, efile))
    log.debug("command is: %s" % command_line)
    job_id = pbs.pbs_submit(conn, job_attrs, job_file, None, None)

# monitor
if job_id:
    p = PBSQuery()
    job_data = p.getjob(job_id)
    old_state = job_data[job_id]["job_state"]
    log.debug("initial state is %s" % old_state)
    running = False
    while True:
        job_data = p.getjob(job_id)
        if not job_data:
            break
        state = job_data[job_id]["job_state"]
        if state != old_state:
            log.debug("job state changed from %s to %s" % (old_state, state))
        if state == "R" and not running:
            running = True
            for data in out_data.values():
#!/usr/bin/python
import sys

from PBSQuery import PBSQuery, PBSError

server = 'grid63.lal.in2p3.fr'

try:
    p = PBSQuery(server)
    pbs = p.get_serverinfo()
    nodes = p.getnodes()
    jobs = p.getjobs()
    queues = p.getqueues()
except PBSError, e:
    print "<h3>Error connecting to PBS server:</h3><tt>", e, "</tt>"
    sys.exit(1)

print ""
print "Server info:"
print pbs

print ""
print "Nodes:"
for node in nodes.keys():
    print "********** %s *********" % node
    print nodes[node]

print ""
print "Jobs:"
for job in jobs.keys():
    print "********** Job %s *********" % job
    print jobs[job]