def main(args):
    """Main script: hand jobs of grace/inactive users to the removal helpers.

    Looks up the users with status 'grace' and 'inactive', fetches all jobs
    from PBS and passes them to ``remove_queued_jobs`` (grace and inactive
    users) and ``remove_running_jobs`` (inactive users only), forwarding the
    ``dry_run`` option to both. ``mail_report`` is called only when the
    ``mail_report`` option is set, ``dry_run`` is not set and at least one of
    the two returned collections is non-empty.

    Any exception is logged, reported via ``opts.critical`` and results in
    exit status ``NAGIOS_EXIT_CRITICAL``.

    :param args: not used here; options are parsed by ``ExtendedSimpleOption``.
    """

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        dry_run = opts.options.dry_run

        # The LdapQuery instance is created but not kept.
        # NOTE(review): presumably it sets up state used by
        # get_user_with_status() - confirm.
        LdapQuery(VscConfiguration())

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        # timestamp for the report, taken just before the jobs are fetched
        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, dry_run)

        if opts.options.mail_report and not dry_run and (removed_queued or removed_running):
            mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Print the name and ``np`` attribute of node "gb-r5n1", then exit with status 0.

    Only the ``np`` and ``state`` attributes are requested from the server.
    The original snippet carried a node loop after ``sys.exit(0)``; it could
    never run (and referenced a name that was never assigned), so it has been
    dropped.
    """
    p = PBSQuery()
    p.new_data_structure()

    wanted_attributes = ['np', 'state']
    node = p.getnode("gb-r5n1", wanted_attributes)
    # single-argument print() behaves the same on Python 2 and 3
    print("%s %s" % (node.name, node['np']))

    sys.exit(0)
    def _get_ppn(self):
        """Guess PBS' `ppn` value for a full node."""
        # cache this value as it's not likely going to change over the
        # `eb` script runtime ...
        if not self._ppn:
            pq = PBSQuery()
            node_vals = pq.getnodes().values(
            )  # only the values, not the names
            interesting_nodes = (
                'free',
                'job-exclusive',
            )
            res = {}
            for np in [
                    int(x['np'][0]) for x in node_vals
                    if x['state'][0] in interesting_nodes
            ]:
                res.setdefault(np, 0)
                res[np] += 1

            if not res:
                raise EasyBuildError(
                    "Could not guess the ppn value of a full node because " +
                    "there are no free or job-exclusive nodes.")

            # return most frequent
            freq_count, freq_np = max([(j, i) for i, j in res.items()])
            self.log.debug(
                "Found most frequent np %s (%s times) in interesting nodes %s"
                % (freq_np, freq_count, interesting_nodes))

            self._ppn = freq_np

        return self._ppn
Exemple #4
0
def main():
    """Print ``np``, ``status.uname`` and ``state`` for every node.

    Only the ``np`` and ``state`` attributes are requested from the server.
    A ``PBSError`` raised while printing a node's attributes is printed and
    the loop continues with the next node.
    """
    p = PBSQuery()
    p.new_data_structure()

    wanted_attributes = ['np', 'state']
    nodes = p.getnodes(wanted_attributes)
    for node_name in nodes:
        print(node_name)

        try:
            node = nodes[node_name]
            print(node.np)
            print(node.status.uname)
            print(node.state)
        except PBSError as detail:
            print(detail)
def get_ppn():
    """Guess the ppn for a full node.

    Returns the most frequent `np` value among the nodes whose first state is
    'free' or 'job-exclusive'; on a tie the larger `np` value wins.

    :raises ValueError: when no node is in one of those states (previously
                        this surfaced as the bare ``max()`` error on an empty
                        sequence).
    """

    log = fancylogger.getLogger('pbs_job.get_ppn')

    interesting_nodes = (
        'free',
        'job-exclusive',
    )

    # np value -> number of interesting nodes reporting it
    res = {}
    for node in PBSQuery().getnodes().values():  # only the values, not the names
        if node['state'][0] in interesting_nodes:
            node_np = int(node['np'][0])
            res[node_np] = res.get(node_np, 0) + 1

    if not res:
        raise ValueError("Could not guess the ppn value of a full node because "
                         "there are no free or job-exclusive nodes.")

    # return most frequent; comparing (count, np) pairs breaks ties on np
    freq_count, freq_np = max((count, node_np) for node_np, count in res.items())
    log.debug("Found most frequent np %s (%s times) in interesting nodes %s" %
              (freq_np, freq_count, interesting_nodes))

    return freq_np
Exemple #6
0
def main():
    """Collect one summary row per node.

    Each row is ``[name, state, power, queue, np, memory, load, display]``
    where ``display`` is a space followed by one "x" per entry in the node's
    ``jobs`` attribute, or "0" when the node has no ``jobs`` attribute.
    A ``PBSError`` for a node is printed and that node is skipped.

    The rows are only accumulated; the visible code neither prints nor
    returns them.
    """
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    rows = []
    for node_id in nodes:
        try:
            node = nodes[node_id]
            queue = node.properties[0]
            state = node.state[0]
            power = node.power_state[0]
            np = node.np[0]
            name = node.name
            # drop the last two characters before converting
            # (assumes a two-letter unit suffix such as "kb" - TODO confirm);
            # `//` keeps the integer result on both Python 2 and 3
            memory = int(node.status.physmem[0][:-2]) // 1000000
            load = node.status.loadave[0]
            if hasattr(node, "jobs"):
                display = " " + "x" * len(node.jobs)
            else:
                display = "0"
            rows.append([name, state, power, queue, np, memory, load, display])
        except PBSError as detail:
            print(detail)
Exemple #7
0
def countppn(queue):
    """Return the summed `ppn` values requested by running jobs in *queue*.

    For every job whose first ``queue`` entry equals *queue* and whose first
    ``job_state`` entry is 'R', all ``ppn=<digits>`` occurrences in the first
    ``Resource_List.nodes`` entry are added up; a job without any ``ppn``
    counts as 1. A ``PBSError`` for a job is printed and the job is skipped.

    The ``ppn`` values are parsed with a regular expression, so values of any
    width are read correctly (the earlier fixed two-character slice truncated
    three-digit values and failed on a trailing ','). The total is returned
    because callers print the result.
    """
    ppn_pattern = re.compile(r'ppn=(\d+)')

    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()
    nptot = 0
    for job_id in jobs:
        try:
            job = jobs[job_id]
            if job.queue[0] == queue and job.job_state[0] == 'R':
                node_spec = job.Resource_List.nodes[0]
                ppn_values = [int(value) for value in ppn_pattern.findall(node_spec)]
                # no ppn in the request: count the job as a single slot
                nptot += sum(ppn_values) if ppn_values else 1
        except PBSError as detail:
            print(detail)
    return nptot
Exemple #8
0
def main():
    """Main script: hand jobs of grace/inactive users to the removal helpers.

    Looks up the users with status 'grace' and 'inactive', fetches all jobs
    from PBS and passes them to ``remove_queued_jobs`` (grace and inactive
    users) and ``remove_running_jobs`` (inactive users only). ``mail_report``
    is called only when the ``mail_report`` option is set, ``dry_run`` is not
    set and at least one of the two returned collections is non-empty.

    Any exception is logged, reported via ``opts.critical`` and results in
    exit status ``NAGIOS_EXIT_CRITICAL``.
    """

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report':
        ('mail a report to the hpc-admin list with job list for gracing or inactive users',
         None, 'store_true', False),
    }
    opts = ExtendedSimpleOption(options)

    try:
        # The LdapQuery instance is created but not kept.
        # NOTE(review): presumably it sets up state used by
        # get_user_with_status() - confirm.
        LdapQuery(VscConfiguration(VSC_CONF_DEFAULT_FILENAME))

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        # timestamp for the report, taken just before the jobs are fetched
        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        # NOTE(review): dry_run is not forwarded to the removal helpers here;
        # verify they honour the dry-run option some other way.
        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run and (removed_queued or removed_running):
            mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
Exemple #9
0
def pbs_handler(name):
    """Publish every PBS job and then the queue state.

    Calls ``publish(jobid, jobinfo)`` for each job returned by
    ``PBSQuery.getjobs()`` and finally ``publish_queue_state`` with the same
    query object.

    :param name: accepted but not used by this function.
    :return: always the empty string.
    """
    pbs = PBSQuery()
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for jobid, jobinfo in pbs.getjobs().items():
        publish(jobid, jobinfo)
    publish_queue_state(pbs)
    return ""
Exemple #10
0
def main():
    """
    Main script: print selected information about one PBS job.

    Requires the ``jobid`` option; without it an error is logged and the
    script exits with status 1. The job is fetched with ``PBSQuery.getjob``,
    passed with the ``information`` option to ``transform_info`` and the
    resulting strings are printed one per line.
    """

    options = {
        "jobid": ("The PBS_JOBID of the job for which we want information", None, "store", None),
        "information": (
            "Comma-separated list of the job info to print. " "Entries of the format input_key:output_key",
            None,
            "store",
            None,
        ),
    }
    opts = simple_option(options)

    if not opts.options.jobid:
        logger.error("jobid is a required option. Bailing.")
        sys.exit(1)

    current_job = PBSQuery().getjob(opts.options.jobid)
    lines = transform_info(current_job, opts.options.information)

    # single-argument print() behaves the same on Python 2 and 3
    print("\n".join(lines))
Exemple #11
0
def main():
    """Print every node, sorted, followed by its attribute/value pairs."""
    nodedict = PBSQuery().getnodes()
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for nodename, node in sorted(nodedict.items()):
        print(nodename)
        for k, v in node.items():
            print("%s %s" % (k, v))
Exemple #12
0
def countppn(queue):
    """Return the summed `ppn` values requested by running jobs in *queue*.

    A job is counted when its first ``queue`` entry equals *queue* and its
    first ``job_state`` entry is 'R'. Every ``ppn=<digits>`` occurrence in the
    first ``Resource_List.nodes`` entry contributes its value; a job whose
    request has no ``ppn`` contributes 1. A ``PBSError`` for a job is printed
    and the job is skipped.

    Parsing uses a regular expression instead of a fixed two-character slice,
    so three-digit values and a trailing ',' no longer break it. The total is
    returned because callers print the result.
    """
    find_ppn = re.compile(r'ppn=(\d+)').findall

    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()
    nptot = 0
    for job_id in jobs:
        try:
            job = jobs[job_id]
            if job.queue[0] != queue or job.job_state[0] != 'R':
                continue
            requested = [int(value) for value in find_ppn(job.Resource_List.nodes[0])]
            # no ppn in the request: count the job as a single slot
            nptot += sum(requested) if requested else 1
        except PBSError as detail:
            print(detail)
    return nptot
Exemple #13
0
    def getModelServers():
        """Build a ``PBSServer`` object for every server reported by PBS.

        Each attribute listed below is filled in when the corresponding key
        is present in the server info; a missing key (``KeyError``) leaves
        that attribute untouched. A ``PBSError`` is printed and ends the
        collection; whatever was gathered up to then is returned.
        """
        # (attribute to set, function computing its value from the raw info),
        # applied in this order
        extractors = (
            ('state', lambda srv: TorqueService._listToStr(srv[pbs.ATTR_status], '|')),
            ('total_jobs', lambda srv: TorqueService._listToInt(srv[pbs.ATTR_total])),
            ('running_jobs', lambda srv: int(TorqueService._strToDict(srv[pbs.ATTR_count][0])['Running'])),
            ('queued_jobs', lambda srv: int(TorqueService._strToDict(srv[pbs.ATTR_count][0])['Queued'])),
            ('pbs_version', lambda srv: TorqueService._listToStr(srv[pbs.ATTR_pbsversion], '|')),
        )

        resultServers = []
        pQuery = PBSQuery()

        try:
            for serverName, pbsServer in pQuery.get_serverinfo().items():
                customServer = PBSServer(name=serverName)
                for attribute, extract in extractors:
                    try:
                        setattr(customServer, attribute, extract(pbsServer))
                    except KeyError:
                        # key not reported for this server: skip the attribute
                        continue
                resultServers.append(customServer)
        except PBSError as pbsErr:
            print(pbsErr)

        return resultServers
Exemple #14
0
class PBSManager():

	def __init__(self):
	    """Query PBS once and keep the results on the instance.

	    Stores the query object (``p``), the queue named by the module-level
	    ``survey``, all jobs, the nodes having the ``use_ressources``
	    property, and the job dictionary's keys (``queue_names``).
	    """
	    query = PBSQuery()
	    self.p = query
	    self.queue = query.getqueue(survey)
	    self.jobs = query.getjobs()
	    self.nodes = query.getnodes_with_property(use_ressources)

	    # despite the name, these are the keys of the jobs dictionary
	    self.queue_names = self.jobs.keys()

	def jobs_running(self):

	    queue_status = self.queue['state_count'][0]

	    Transit, Queued, Held, Waiting, Running, Exiting = queue_status.split()

	    Queued = int(Queued.split(':')[1])
	    Running = int(Running.split(':')[1])
	    return Running, Queued


	def is_running(self, basefilename):

	    jobnames = []
	    for jobs_name in self.queue_names: 
	        job_info = self.jobs[jobs_name]
		jobnames.append(job_info["Job_Name"][0])

	    return (basefilename in jobnames)


	def get_stderr_path(self, basefilename):
	    """Return the path of the error log ``<basefilename>.err`` in ``logs_loc``.

	    Raises ValueError when that file does not exist.
	    """
	    stderr_path = os.path.join(logs_loc, "%s.err" % (basefilename))
	    if not os.path.exists(stderr_path):
	        raise ValueError("Cannot find error log for job (%s): %s" %
	                         (basefilename, stderr_path))
	    return stderr_path

	def had_errors(self, basefilename):
	    """Return True when the job's error log exists and is not empty.

	    A missing error log (``ValueError`` from ``get_stderr_path``) counts
	    as "no errors".
	    """
	    try:
	        errorlog = self.get_stderr_path(basefilename)
	    except ValueError:
	        return False
	    return os.path.getsize(errorlog) > 0

	def get_errors(self, basefilename):    
	    try:
		errorlog = self.get_stderr_path(basefilename)
	    except ValueError, e:
	        errors = str(e)
	    else:
def main():
    """Print the server name and all server attributes, sorted by key."""
    pq = PBSQuery()
    # single-argument print() behaves the same on Python 2 and 3
    print("Server: %s" % (pq.server,))
    serverinfo = pq.get_serverinfo()
    for k, v in sorted(serverinfo.items()):
        print(k)
        for i, j in sorted(v.items()):
            print("%s %s" % (i, j))
Exemple #16
0
def main():
    """Print, per queue, the results of ``countppn`` and ``countcpu``.

    Output format per queue: ``<queue> : [ <countppn> / <countcpu> ] ``.
    """
    print("CPU cluster usage :")
    p = PBSQuery()
    for queue in p.getqueues():
        np = countcpu(queue)
        npp = countppn(queue)
        print("%s : [ %s / %s ] " % (queue, npp, np))
Exemple #17
0
def main():
    """Print, per queue, the results of ``countmem`` and ``countppn``.

    Output format per queue: ``<queue> : [ <countmem> / <countppn> ] ``.
    """
    print("Queue memory usage in Gb [ reserved / available ] : ")
    p = PBSQuery()
    for queue in p.getqueues():
        np = countmem(queue)
        npp = countppn(queue)
        print("%s : [ %s / %s ] " % (queue, np, npp))
Exemple #18
0
def main():
    """Print every job, sorted, with its indented attribute/value pairs.

    Each job is followed by an empty line.
    """
    jobsdict = PBSQuery().getjobs()
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for jobid, job in sorted(jobsdict.items()):
        print(jobid)
        for k, v in job.items():
            # five spaces: the four-space prefix plus the separator that the
            # comma-separated print statement used to insert
            print("     %s %s" % (k, v))
        print("")
def main():
    """Print the raw queue dictionary, then every queue with its attributes.

    Queues are printed in sorted order, each followed by an empty line.
    """
    queuedict = PBSQuery().getqueues()
    print(queuedict)
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for queuename, queue in sorted(queuedict.items()):
        print(queuename)
        for k, v in queue.items():
            print("%s %s" % (k, v))
        # explicit empty string: a bare `print` does nothing on Python 3
        print("")
Exemple #20
0
    def getModelJobs():
        """
        Build a ``PBSJob`` object for every job reported by PBS.

        Each attribute listed below is filled in when the corresponding key
        is present in the job record; a missing key (``KeyError``) leaves
        that attribute untouched. A ``PBSError`` is printed and ends the
        collection; whatever was gathered up to then is returned.
        """
        def joined(values):
            # textual attributes are flattened with '|' as separator
            return TorqueService._listToStr(values, '|')

        # (attribute to set, function computing its value from the raw job),
        # applied in this order
        extractors = (
            ('name', lambda job: joined(job[pbs.ATTR_name])),
            ('owner', lambda job: joined(job[pbs.ATTR_owner])),
            ('state', lambda job: joined(job[pbs.ATTR_state])),
            ('queue_raw', lambda job: joined(job[pbs.ATTR_queue])),
            ('start_time',
             lambda job: datetime.fromtimestamp(TorqueService._listToInt(job[pbs.ATTR_start_time]))),
            ('resource_cput', lambda job: joined(job[pbs.ATTR_used]['cput'])),
            ('resource_mem', lambda job: joined(job[pbs.ATTR_used]['mem'])),
            ('resource_vmem', lambda job: joined(job[pbs.ATTR_used]['vmem'])),
            ('resource_walltime', lambda job: joined(job[pbs.ATTR_used]['walltime'])),
        )

        resultJobs = []
        pQuery = PBSQuery()
        try:
            for jobName, pbsJob in pQuery.getjobs().items():
                customJob = PBSJob(jobId=jobName)
                for attribute, extract in extractors:
                    try:
                        setattr(customJob, attribute, extract(pbsJob))
                    except KeyError:
                        # key not reported for this job: skip the attribute
                        continue
                resultJobs.append(customJob)
        except PBSError as pbsErr:
            print(pbsErr)

        return resultJobs
Exemple #21
0
def print_header():
    """Print the HTML content-type header and check the PBS connection.

    The connection check consists of creating a ``PBSQuery`` for ``SERVER``
    and fetching the jobs. On ``PBSError`` an HTML error message is printed
    and the script exits with status 1.
    """
    print("Content-Type: text/html")

    # try connecting to the PBS server
    try:
        pbs = PBSQuery(SERVER)
        jobs = pbs.getjobs()
    except PBSError as error:
        # spaces around the error text match what the comma-separated print
        # statement used to emit
        print("<h1>Error connecting to PBS server:</h1><tt> %s </tt>" % (error,))
        sys.exit(1)
Exemple #22
0
 def __init__(self, server_name):
     """Copy the attributes of the first reported server onto this object.

     ``name`` is the server's key in the server info and ``p`` the query
     object. Attributes whose key starts with 'resources' are dictionaries:
     each entry becomes ``<key>_<subkey>``. For every attribute only the
     first list element is stored.

     Raises IndexError when the server info is empty.
     """
     p = PBSQuery(str(server_name))
     # list(...) keeps `[0]` working where items() returns a view (Python 3)
     name, attributes = list(p.get_serverinfo().items())[0]
     self.name = name
     self.p = p
     for k, v in attributes.items():
         if k.startswith('resources'):
             for i, j in v.items():
                 setattr(self, k + '_' + i, j[0])
         else:
             setattr(self, k, v[0])
Exemple #23
0
 def __init__(self, server_name):
     """Expose the first reported server's attributes as instance attributes.

     Sets ``name`` (the server's key in the server info) and ``p`` (the
     query object). Keys starting with 'resources' hold dictionaries and are
     flattened to ``<key>_<subkey>``; in all cases only the first list
     element of a value is stored.

     Raises IndexError when the server info is empty.
     """
     p = PBSQuery(str(server_name))
     # materialise the items so indexing also works with dict views (Python 3)
     server_entries = list(p.get_serverinfo().items())
     self.name, attributes = server_entries[0]
     self.p = p
     for k, v in attributes.items():
         if not k.startswith('resources'):
             setattr(self, k, v[0])
             continue
         for i, j in v.items():
             setattr(self, k + '_' + i, j[0])
Exemple #24
0
def print_table_summary():
    """Connect to the PBS server; on failure report the error and exit.

    Uses ``OPT_SERVERNAME`` when it is set, otherwise the default server.
    A ``PBSError`` is reported through ``_print`` and the script exits with
    status -1. The visible code only establishes the connection.
    """
    global PBS_STATES
    global OPT_SERVERNAME

    try:
        p = PBSQuery(OPT_SERVERNAME) if OPT_SERVERNAME else PBSQuery()
    except PBSError as reason:
        _print('error: %s' % reason)
        sys.exit(-1)
def getData():
    """Fill the global ``Hosts`` mapping with one ``TorqueHost`` per node.

    Every piece of information is applied on a best-effort basis: when an
    attribute is missing or cannot be converted, that setter is skipped
    (``setSlotsUsed`` falls back to 0). Only ``Exception`` subclasses are
    suppressed, so ``KeyboardInterrupt`` and ``SystemExit`` propagate.
    """
    # key inside the node's 'status' entry -> TorqueHost setter that receives
    # the first element of its value
    status_setters = (
        ('availmem', 'setAvailMem'),
        ('totmem', 'setTotalMem'),
        ('rectime', 'setRecTime'),
        ('loadave', 'setLoadAve'),
        ('netload', 'setNetLoad'),
    )

    p = PBSQuery()
    nodes = p.getnodes()
    for node in nodes:
        attrs = nodes[node]
        host = TorqueHost(node)

        try:
            host.setState(attrs['state'])
        except Exception:
            pass  # best effort: leave the host's state as constructed

        try:
            host.setSlots(int(attrs['np'][0]))
        except Exception:
            pass  # best effort: leave the host's slots as constructed

        try:
            host.setSlotsUsed(len(attrs['jobs']))
        except Exception:
            # no usable 'jobs' attribute: treat the node as unused
            host.setSlotsUsed(0)

        try:
            host.setJobList(attrs['status']['jobs'][0].split())
        except Exception:
            pass  # best effort: no job list available

        for status_key, setter_name in status_setters:
            try:
                getattr(host, setter_name)(attrs['status'][status_key][0])
            except Exception:
                pass  # best effort: this status value is not available

        Hosts[host.Name] = host
def getData():
    """Register a ``TorqueHost`` for every PBS node in the global ``Hosts``.

    Each value is applied on a best-effort basis: a missing or unconvertible
    attribute just skips the corresponding setter (``setSlotsUsed`` falls
    back to 0). Only ``Exception`` subclasses are suppressed, so
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
    """
    # (key in the node's 'status' entry, TorqueHost setter receiving the
    # first element of its value), applied in this order
    status_fields = (
        ('availmem', 'setAvailMem'),
        ('totmem', 'setTotalMem'),
        ('rectime', 'setRecTime'),
        ('loadave', 'setLoadAve'),
        ('netload', 'setNetLoad'),
    )

    p = PBSQuery()
    nodes = p.getnodes()
    for node in nodes:
        info = nodes[node]
        host = TorqueHost(node)

        try:
            host.setState(info['state'])
        except Exception:
            pass  # best effort: state not available

        try:
            host.setSlots(int(info['np'][0]))
        except Exception:
            pass  # best effort: slot count not available

        try:
            used_slots = len(info['jobs'])
        except Exception:
            # no usable 'jobs' attribute: treat the node as unused
            used_slots = 0
        host.setSlotsUsed(used_slots)

        try:
            host.setJobList(info['status']['jobs'][0].split())
        except Exception:
            pass  # best effort: job list not available

        for key, setter in status_fields:
            try:
                getattr(host, setter)(info['status'][key][0])
            except Exception:
                pass  # best effort: this status value is not available

        Hosts[host.Name] = host
Exemple #27
0
def countcpu(queue):
    """Return the summed ``np`` of all nodes whose properties equal ``[queue]``.

    A ``PBSError`` while reading a node's ``np`` is printed and the node is
    skipped. The total is returned because callers print the result.
    """
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    nptot = 0
    for node_id in nodes:
        if nodes[node_id].properties == [queue]:
            try:
                nptot += int(nodes[node_id].np[0])
            except PBSError as detail:
                print(detail)
    return nptot
Exemple #28
0
def countcpu(queue):
    """Add up the ``np`` values of the nodes whose properties are exactly ``[queue]``.

    Nodes with other properties are ignored. A ``PBSError`` raised while
    reading ``np`` is printed and that node is skipped. The total is
    returned because callers print the result.
    """
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    nptot = 0
    for node_id in nodes:
        if nodes[node_id].properties != [queue]:
            continue
        try:
            nptot += int(nodes[node_id].np[0])
        except PBSError as detail:
            print(detail)
    return nptot
Exemple #29
0
    def getJobs():
        """Build a ``Job`` object for every job reported by PBS.

        Each value listed below is applied when the corresponding key is
        present in the job record; a missing key (``KeyError``) skips that
        value. A ``PBSError`` is printed and ends the collection; whatever
        was gathered up to then is returned.
        """
        def joined(values):
            # textual attributes are flattened with '|' as separator
            return TorqueService._listToStr(values, '|')

        # (target name, target is a setter method?, function computing the
        # value from the raw job), applied in this order
        steps = (
            ('state', False, lambda job: joined(job[pbs.ATTR_state])),
            ('user', False, lambda job: joined(job[pbs.ATTR_owner])),
            ('queue', False, lambda job: joined(job[pbs.ATTR_queue])),
            ('name', False, lambda job: joined(job[pbs.ATTR_name])),
            ('cpu_time', False, lambda job: joined(job[pbs.ATTR_l]['walltime'])),
            ('n_p', False, lambda job: TorqueService._splitResourcesList(job[pbs.ATTR_l]['nodes'])),
            ('setQueued', True, lambda job: joined(job[pbs.ATTR_qtime])),
            ('setStarted', True, lambda job: joined(job[pbs.ATTR_start_time])),
            ('running_time', False, lambda job: joined(job[pbs.ATTR_used]['walltime'])),
        )

        resultJobs = []
        p = PBSQuery()

        try:
            for jobId, pbsJob in p.getjobs().items():
                customJob = Job(jobId)
                for target, is_setter, extract in steps:
                    try:
                        value = extract(pbsJob)
                        if is_setter:
                            getattr(customJob, target)(value)
                        else:
                            setattr(customJob, target, value)
                    except KeyError:
                        # key not reported for this job: skip the value
                        continue
                resultJobs.append(customJob)
        except PBSError as pbsErr:
            print(pbsErr)

        return resultJobs
def main(args):
    """Main script: hand jobs of grace/inactive users to the removal helpers.

    With the ``nagios`` option the cached nagios result is reported and the
    script exits. When ``proceed_on_ha_service`` returns a false value for
    the ``ha`` option, a warning is cached for nagios and the script exits
    with ``NAGIOS_EXIT_WARNING``.

    Otherwise the users with status 'grace' and 'inactive' are looked up,
    all PBS jobs are fetched and passed to ``remove_queued_jobs`` (grace and
    inactive users) and ``remove_running_jobs`` (inactive users only),
    forwarding ``dry_run``. ``mail_report`` is called only when the
    ``mail_report`` option is set, ``dry_run`` is not set and at least one of
    the two returned collections is non-empty.

    Any exception is logged, cached as a critical nagios result and leads to
    exit status ``NAGIOS_EXIT_CRITICAL``.

    :param args: not used here; options are parsed by ``simple_option``.
    """

    options = {
        'nagios': ('print out nagion information', None, 'store_true', False, 'n'),
        'nagios_check_filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios_check_interval_threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    # NOTE(review): the reporter is built from the module constants, not from
    # the nagios_check_* options parsed above - confirm this is intended.
    nagios_reporter = NagiosReporter(NAGIOS_HEADER, NAGIOS_CHECK_FILENAME, NAGIOS_CHECK_INTERVAL_THRESHOLD)

    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        # store the result through cache(), as the failure path below does;
        # the reporter object was previously called like a function here
        nagios_reporter.cache(NAGIOS_EXIT_WARNING,
                              NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    try:
        dry_run = opts.options.dry_run

        # The LdapQuery instance is created but not kept.
        # NOTE(review): presumably it sets up state used by
        # get_user_with_status() - confirm.
        LdapQuery(VscConfiguration())

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        # timestamp for the report, taken just before the jobs are fetched
        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, dry_run)

        if opts.options.mail_report and not dry_run and (removed_queued or removed_running):
            mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("Something went wrong: {err}".format(err=err))
        nagios_reporter.cache(NAGIOS_EXIT_CRITICAL,
                              NagiosResult("Script failed, check log file ({logfile})".format(logfile=PBS_CHECK_LOG_FILE)))
        sys.exit(NAGIOS_EXIT_CRITICAL)
Exemple #31
0
    def get_ppn(self):
        """Guess the ppn for a full node.

        Returns the most frequent `np` value among the nodes whose first
        state is 'free' or 'job-exclusive'; on a tie the larger `np` wins.

        Raises ValueError when no node is in one of those states (previously
        this surfaced as the bare ``max()`` error on an empty sequence).
        """
        interesting_states = ('free', 'job-exclusive',)

        # np value -> number of matching nodes reporting it
        res = {}
        for node in PBSQuery().getnodes().values():  # only the values, not the names
            if node['state'][0] in interesting_states:
                node_np = int(node['np'][0])
                res[node_np] = res.get(node_np, 0) + 1

        if not res:
            raise ValueError("Could not guess the ppn value of a full node because "
                             "there are no free or job-exclusive nodes.")

        # return most frequent; comparing (count, np) pairs breaks ties on np
        freq_count, freq_np = max((count, node_np) for node_np, count in res.items())
        self.log.debug("Found most frequent np %s (%s times) in interesni nodes %s" % (freq_np, freq_count, interesting_states))

        return freq_np
Exemple #32
0
def countmem(queue):
    """Return the summed ``Resource_List.mem`` of all jobs whose queue equals ``[queue]``.

    The last two characters of the first ``mem`` entry are dropped before
    the integer conversion (assumes a two-letter unit suffix such as "gb" -
    TODO confirm; differing units are not normalised). A ``PBSError`` for a
    job is printed and the job is skipped. The total is returned because
    callers print the result.
    """
    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()
    nptot = 0
    for job_id in jobs:
        if jobs[job_id].queue == [queue]:
            try:
                mem = jobs[job_id].Resource_List.mem
                nptot += int(mem[0][:-2])
            except PBSError as detail:
                print(detail)
    return nptot
Exemple #33
0
def get_nodes(racknode=False, hosts=None):
    """Connect to the PBS server; on failure report the error and exit.

    Uses ``OPT_SERVERNAME`` when it is set, otherwise the default server.
    A ``PBSError`` is reported through ``_print`` and the script exits with
    status -1. The visible code only prepares an empty node dictionary and
    establishes the connection; ``racknode`` and ``hosts`` are not used in it.
    """
    global LENGTH_NODE
    global LENGTH_STATE
    global OPT_SERVERNAME

    nodes_dict = {}

    try:
        p = PBSQuery(OPT_SERVERNAME) if OPT_SERVERNAME else PBSQuery()
    except PBSError as reason:
        _print('Error: %s' % reason)
        sys.exit(-1)
Exemple #34
0
	def __init__(self):
	    """Query PBS once and keep the results on the instance.

	    Stores the query object (``p``), the queue named by the module-level
	    ``survey``, all jobs, the nodes having the ``use_ressources``
	    property, and the job dictionary's keys (``queue_names``).
	    """
	    query = PBSQuery()
	    self.p = query
	    self.queue = query.getqueue(survey)
	    self.jobs = query.getjobs()
	    self.nodes = query.getnodes_with_property(use_ressources)

	    # despite the name, these are the keys of the jobs dictionary
	    self.queue_names = self.jobs.keys()
Exemple #35
0
  def __init__(self, server, verbosity=0, diagOutputFile=None):
    """Initialise the object and load the Torque configuration from *server*.

    Stores the server info for *server*, all nodes and all queues. On
    ``PBSError`` the error is passed to the debug helper at level 0 and the
    process exits with status 1.

    ``diagOutputFile`` is not used in the visible code.
    """
    self.SRList = {}
    self.activeNodes = {}
    self.verbosity = verbosity
    self.server = server

    # Load Torque configuration
    try:
        torqueConfig = PBSQuery(server)
        self.server_info = torqueConfig.get_serverinfo()[server]
        self.nodes = torqueConfig.getnodes()
        self.queues = torqueConfig.getqueues()
    except PBSError as e:
        self.__debug(0, "Error connecting to PBS server: %s" % e)
        sys.exit(1)
  def __init__(self, server, verbosity=0, diagOutputFile=None):
    """Set up empty bookkeeping and read the Torque configuration of *server*.

    Keeps the server info entry for *server*, the nodes and the queues on
    the instance. A ``PBSError`` is handed to the debug helper at level 0,
    after which the process exits with status 1.

    ``diagOutputFile`` is not used in the visible code.
    """
    self.SRList = {}
    self.activeNodes = {}
    self.verbosity = verbosity
    self.server = server

    # Load Torque configuration
    try:
        torqueConfig = PBSQuery(server)
        self.server_info = torqueConfig.get_serverinfo()[server]
        self.nodes = torqueConfig.getnodes()
        self.queues = torqueConfig.getqueues()
    except PBSError as e:
        self.__debug(0, "Error connecting to PBS server: %s" % e)
        sys.exit(1)
Exemple #37
0
def countppn(queue):
    """Return the summed physical memory of the nodes whose first property is *queue*.

    Despite its name this variant adds up ``status.physmem``: the last two
    characters of the value are dropped (assumes a two-letter unit suffix
    such as "kb" - TODO confirm) and the number is floor-divided by
    1000000 per node. A ``PBSError`` for a node is printed and the node is
    skipped. The total is returned because callers print the result.
    """
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    nptot = 0
    for node_id in nodes:
        try:
            if nodes[node_id].properties[0] == queue:
                physmem = nodes[node_id].status.physmem[0]
                # `//` keeps the integer result on both Python 2 and 3
                nptot += int(physmem[:-2]) // 1000000
        except PBSError as detail:
            print(detail)
    return nptot
Exemple #38
0
def main():
    """Print the ``Resource_List`` of every job.

    A ``PBSError`` raised while reading a job's attributes is printed
    instead and the loop continues with the next job.
    """
    p = PBSQuery()
    p.new_data_structure()
    jobs = p.getjobs()

    for job_id in jobs:
        try:
            job = jobs[job_id]
            # These values are read but not printed. The lookups are kept on
            # purpose. NOTE(review): a failing lookup presumably raises
            # PBSError and so skips the print below - confirm before removing.
            _mem = job.Resource_List.mem
            _nodect = job.Resource_List.nodect
            resources = job.Resource_List
            _exec_host = job.exec_host
            _queue = job.queue
            print(resources)
        except PBSError as detail:
            print(detail)
Exemple #39
0
def main():
    """Print the ``status.loadave`` value of every node.

    A ``PBSError`` for a node is printed instead and the loop continues
    with the next node.
    """
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()

    for node_id in nodes:
        try:
            print(nodes[node_id].status.loadave)
        except PBSError as detail:
            print(detail)
def main():
    """Main script: hand jobs of grace/inactive users to the removal helpers.

    Active and inactive accounts are fetched from the account page REST API.
    An active account is treated as a grace user when it has an expiry date
    (format ``%Y-%m-%d``) less than 7 days after the current UTC time.
    All PBS jobs are then passed to ``remove_queued_jobs`` (grace and
    inactive users) and ``remove_running_jobs`` (inactive users only).
    ``mail_report`` is called only when the ``mail_report`` option is set,
    ``dry_run`` is not set and at least one of the two returned collections
    is non-empty.

    Any exception is logged, reported via ``opts.critical`` and results in
    exit status ``NAGIOS_EXIT_CRITICAL``.
    """

    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None)
    }
    opts = ExtendedSimpleOption(options)

    try:
        now = datetime.datetime.utcnow()
        grace_window = datetime.timedelta(days=7)
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        active_users, inactive_users = client.get_accounts()

        grace_users = []
        for a in active_users:
            try:
                if a.expiry_date and datetime.datetime.strptime(a.expiry_date, "%Y-%m-%d") - now < grace_window:
                    grace_users.append(a)
            except AttributeError:
                logger.debug("Account %s does not have expiry date", a.vsc_id)

        pbs_query = PBSQuery()

        # timestamp for the report, taken just before the jobs are fetched
        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        # NOTE(review): dry_run is not forwarded to the removal helpers here;
        # verify they honour the dry-run option some other way.
        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users)
        removed_running = remove_running_jobs(jobs, inactive_users)

        if opts.options.mail_report and not opts.options.dry_run and (removed_queued or removed_running):
            mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
    def _get_ppn(self):
        """Guess PBS' `ppn` value for a full node."""
        # cache this value as it's not likely going to change over the
        # `eb` script runtime ...
        if not self._ppn:
            pq = PBSQuery()
            node_vals = pq.getnodes().values()  # only the values, not the names
            interesting_nodes = ('free', 'job-exclusive',)
            res = {}
            for np in [int(x['np'][0]) for x in node_vals if x['state'][0] in interesting_nodes]:
                res.setdefault(np, 0)
                res[np] += 1

            # return most frequent
            freq_count, freq_np = max([(j, i) for i, j in res.items()])
            self.log.debug("Found most frequent np %s (%s times) in interesting nodes %s" % (freq_np, freq_count, interesting_nodes))

            self._ppn = freq_np

        return self._ppn
Exemple #42
0
class PBSScheduler(AbstractClusterScheduler):
    """Cluster scheduler that queries and deletes jobs through `PBSQuery` / `pbs`."""

    def __init__(self, *args, **kwargs):
        """Create the `PBSQuery` handle, then run the base-class initialiser."""
        self.pbsquery = PBSQuery()
        super(PBSScheduler, self).__init__(*args, **kwargs)

    def output_regexp(self):
        """Return a regular expression that captures the digits at the start of a string.

        NOTE(review): presumably applied to the submit command's output to
        pick up the numeric job id -- confirm against the base class.
        """
        return r'(^\d+)'

    def submit_command(self, output_file, job_name):
        """Return the `qsub` command line used to submit a job.

        :param output_file: path given to both `-e` and `-o` (`-j oe` is also passed)
        :param job_name: name given to `-N`
        :return: the command line as a string
        """
        # Note for posterity: ssh bint01 "source BLAH && qsub BLAH" doesn't work
        return """qsub -N %s -e %s -o %s -j oe """ % (job_name, output_file, output_file)

    def alive(self, process_id):
        """Tell whether job `process_id` is queued or running; delete it otherwise.

        A job in state 'Q' or 'R' counts as alive.  Every other outcome
        (held, suspended, any other state, or a failed lookup) counts as not
        alive, and an attempt is then made to delete the job.

        :param process_id: numeric job id (it is formatted with `%d`)
        :return: True if the job is queued or running, False otherwise
        """
        try:
            status = self.pbsquery.getjob(str(process_id))['job_state'][0]
        except Exception:
            # job not found; `Exception` instead of a bare except so that
            # KeyboardInterrupt and SystemExit are no longer swallowed
            status = None
            sys.stderr.write("EXC: %s\n" % str(sys.exc_info()[0]))
            sys.stderr.write("Could not find job for process id %d\n" % process_id)

        alive = False
        if status == 'Q':
            sys.stderr.write("Job %d waiting in queue.\n" % (process_id))
            alive = True
        elif status == 'R':
            sys.stderr.write("Job %d is running.\n" % (process_id))
            alive = True
        elif status in ('H', 'S'):
            sys.stderr.write("Job %d is held or suspended.\n" % (process_id))

        if alive:
            return True

        try:
            # Kill the job.
            # NOTE(review): the connection is never closed and the return
            # value of pbs_deljob is not checked, so "Killed" is reported
            # whenever no exception is raised -- confirm this is intended.
            c = pbs.pbs_connect(pbs.pbs_default())
            pbs.pbs_deljob(c, str(process_id))
            sys.stderr.write("Killed job %d.\n" % (process_id))
        except Exception:
            sys.stderr.write("Failed to kill job %d.\n" % (process_id))

        return False
Exemple #43
0
class PBSScheduler(AbstractClusterScheduler):
    """Cluster scheduler that queries and deletes jobs through `PBSQuery` / `pbs`."""

    def __init__(self, *args, **kwargs):
        """Create the `PBSQuery` handle, then run the base-class initialiser."""
        self.pbsquery = PBSQuery()
        super(PBSScheduler, self).__init__(*args, **kwargs)

    def output_regexp(self):
        """Return a regular expression that captures the digits at the start of a string.

        NOTE(review): presumably applied to the submit command's output to
        pick up the numeric job id -- confirm against the base class.
        """
        return r'(^\d+)'

    def submit_command(self, output_file, job_name):
        """Return the command line that submits a job with `qsub` over ssh on `bint01`.

        :param output_file: path given to both `-e` and `-o` (`-j oe` is also passed)
        :param job_name: name given to `-N`
        :return: the command line as a string
        """
        # Note for posterity: ssh bint01 "source BLAH && qsub BLAH" doesn't work
        return """ssh bint01 "/usr/syscom/nsg/opt/torque/4.2.6/bin/qsub -S /bin/bash -N %s -e %s -o %s -j oe" """ % \
               (job_name, output_file, output_file)

    def alive(self, process_id):
        """Tell whether job `process_id` is queued or running; delete it otherwise.

        A job in state 'Q' or 'R' counts as alive.  Every other outcome
        (held, suspended, any other state, or a failed lookup) counts as not
        alive, and an attempt is then made to delete the job.

        :param process_id: numeric job id (it is formatted with `%d`)
        :return: True if the job is queued or running, False otherwise
        """
        try:
            status = self.pbsquery.getjob(str(process_id))['job_state'][0]
        except Exception:
            # job not found; `Exception` instead of a bare except so that
            # KeyboardInterrupt and SystemExit are no longer swallowed
            status = None
            sys.stderr.write("EXC: %s\n" % str(sys.exc_info()[0]))
            sys.stderr.write("Could not find job for process id %d\n" %
                             process_id)

        alive = False
        if status == 'Q':
            sys.stderr.write("Job %d waiting in queue.\n" % (process_id))
            alive = True
        elif status == 'R':
            sys.stderr.write("Job %d is running.\n" % (process_id))
            alive = True
        elif status in ('H', 'S'):
            sys.stderr.write("Job %d is held or suspended.\n" % (process_id))

        if alive:
            return True

        try:
            # Kill the job.
            # NOTE(review): the connection is never closed and the return
            # value of pbs_deljob is not checked, so "Killed" is reported
            # whenever no exception is raised -- confirm this is intended.
            c = pbs.pbs_connect(pbs.pbs_default())
            pbs.pbs_deljob(c, str(process_id))
            sys.stderr.write("Killed job %d.\n" % (process_id))
        except Exception:
            sys.stderr.write("Failed to kill job %d.\n" % (process_id))

        return False
Exemple #44
0
    def getModelQueues(pbsServer):
        """Build one `PBSQueue` for every queue returned by `PBSQuery.getqueues()`.

        Each attribute is filled in independently: when looking up its source
        raises KeyError, that attribute is left untouched and the next one is
        tried.  A `PBSError` raised while querying or converting is printed,
        and the queues collected up to that point are returned.

        :param pbsServer: value passed as `server` to every `PBSQueue`
        :return: list of `PBSQueue` objects
        """
        # (attribute name, extractor) pairs, applied in this order to each queue
        extractors = (
            ('type',
             lambda q: TorqueService._listToStr(q['queue_type'], '|')),
            ('total_jobs',
             lambda q: TorqueService._listToInt(q[pbs.ATTR_total])),
            ('running_jobs',
             lambda q: int(TorqueService._strToDict(q[pbs.ATTR_count][0])['Running'])),
            ('queued_jobs',
             lambda q: int(TorqueService._strToDict(q[pbs.ATTR_count][0])['Queued'])),
            ('resource_walltime',
             lambda q: TorqueService._listToStr(q[pbs.ATTR_rescdflt]['walltime'], '|')),
            ('resource_nodes',
             lambda q: TorqueService._listToInt(q[pbs.ATTR_rescdflt]['nodes'])),
        )

        collected = []
        query = PBSQuery()

        try:
            for queueName, pbsQueue in query.getqueues().items():
                modelQueue = PBSQueue(server=pbsServer, name=queueName)
                for attribute, extract in extractors:
                    try:
                        setattr(modelQueue, attribute, extract(pbsQueue))
                    except KeyError:
                        # the server did not report this entry; skip it
                        continue
                collected.append(modelQueue)
        except PBSError as pbsErr:
            print(pbsErr)

        return collected
Exemple #45
0
def main():
    """Collect name, state, power state, queue, np and job for every node.

    For each node returned by `PBSQuery.getnodes()` a row
    `[name, state, power, queue, np, jobs]` is built.  `jobs` is taken from
    the first entry of the node's `jobs` attribute: the part after the first
    '/' when there is one, the whole entry otherwise, or the string "none"
    when the node has no `jobs` attribute.  A `PBSError` raised while reading
    a node is printed and that node is skipped.

    The rows are only collected; this function neither prints nor returns them.
    """
    p = PBSQuery()
    p.new_data_structure()
    nodes = p.getnodes()
    rows = []
    for node_id in nodes:
        try:
            node = nodes[node_id]
            queue = node.properties[0]
            state = node.state[0]
            power = node.power_state[0]
            np = node.np[0]
            name = node.name
            if hasattr(node, "jobs"):
                parts = node.jobs[0].split('/')
                jobs = parts[1] if len(parts) > 1 else parts[0]
            else:
                jobs = "none"
            rows.append([name, state, power, queue, np, jobs])
        except PBSError as detail:
            print(detail)
Exemple #46
0
def main():
    """
    Print selected information about one PBS job.

    Reads the `jobid` and `information` options, looks the job up through
    `PBSQuery`, hands the job and the `information` option to
    `transform_info`, and prints the resulting strings one per line.
    Exits with status 1 when no jobid was given.
    """
    jobid_help = "The PBS_JOBID of the job for which we want information"
    information_help = ("Comma-separated list of the job info to print. "
                        "Entries of the format input_key:output_key")

    opts = simple_option({
        "jobid": (jobid_help, None, "store", None),
        "information": (information_help, None, "store", None),
    })

    # the job id is mandatory: without it there is nothing to look up
    if not opts.options.jobid:
        logger.error("jobid is a required option. Bailing.")
        sys.exit(1)

    current_job = PBSQuery().getjob(opts.options.jobid)
    lines = transform_info(current_job, opts.options.information)

    print("\n".join(lines))
    def __init__(self, pbs_server=None, debug_p=False):
        """Set up a QstatViewer and load its data through PBSQuery.

        Arguments:
           - pbs_server : FQDN of the TORQUE server to query (string)
           - debug_p    : stored on the instance as `debug_p`
        """
        self.debug_p = debug_p

        # start from empty containers before any of the __make_* helpers run
        self.nodes, self.jobs, self.queues = {}, {}, {}

        query = PBSQuery(pbs_server)
        self.pbsquery = query
        self.servername = query.get_server_name()

        # server first, then queues, jobs and nodes
        self.__make_server()
        self.__make_queues()
        self.__make_jobs()
        self.__make_nodes()
Exemple #48
0
 def __init__(self, *args, **kwargs):
     """Create the `PBSQuery` handle, then run the base-class initialiser."""
     self.pbsquery = PBSQuery()
     super(PBSScheduler, self).__init__(*args, **kwargs)
Exemple #49
0
        for p in ['started', 'enabled', 'acl_group_enable']:
            if p in pbsqueue_dict:
                if pbsqueue_dict[p][0] == 'True':
                    self.__dict__[p] = True
                else:
                    self.__dict__[p] = False

        if 'state_count' in pbsqueue_dict:
            self.state_count = {}
            state_counts = pbsqueue_dict['state_count'][0].strip().split(' ')
            for s in state_counts:
                name, count = s.split(':')
                self.state_count[name] = int(count)



    def __str__(self):
        """Return the string form of this object's attribute dictionary."""
        attributes = vars(self)
        return str(attributes)


if __name__ == '__main__':
    # Wrap every queue reported by the server in a Queue object, then print
    # them one per line, ordered by queue name.
    pq = PBSQuery()
    queues = {}
    for k, v in sorted(pq.getqueues().items()):
        queues[k] = Queue(name=k, pbsqueue_dict=v)

    for queuename in sorted(queues):
        queue = queues[queuename]
        print('%s : %s' % (queuename, queue))


Exemple #50
0
#!/usr/bin/env python
#
# Author: Bas van der Vlies <*****@*****.**>
# Date  : 17 Aug 2001 
# Desc. : Print the 'event' entry of a single node
#
# SVN info:
#   $Id: node_event_parse.py 287 2012-12-20 11:03:07Z bas $
#   $URL: https://oss.trac.surfsara.nl/pbs_python/svn/tags/4.6.0/examples/node_event_parse.py $ 
#
#
#


import pbs
import sys
from PBSQuery import PBSQuery

# Look up one hard-coded node and print its 'event' entry.
p = PBSQuery()
node = p.getnode('apccl02.in2p3.fr')
print(node['event'])
class QstatViewer:
    """
    Presents a nicer (?) interface to PBSQuery
    The main member objects are:
    * jobs -- a dictionary with job ID (as str) as the key,
              and the corresponding Job object as the value
    * nodes -- a dictionary with node name as the key,
               and the corresponding Node object as the value
    * queues -- a dictionary with queue name as the key,
                and the corresponding Queue object as the value
    In addition, every entry of the server info is set as an attribute on
    the instance (see __make_server).
    """
    def __init__(self, pbs_server=None, debug_p=False):
        """Creates a QstatViewer object. Arguments:
           - pbs_server : FQDN of the TORQUE server to query (string)
           - debug_p    : when true, the methods print debugging output"""
        self.debug_p = debug_p

        self.nodes = {}
        self.jobs = {}
        self.queues = {}

        self.pbsquery = PBSQuery(pbs_server)

        self.servername = self.pbsquery.get_server_name()

        self.__make_server()
        self.__make_queues()
        self.__make_jobs()
        self.__make_nodes()

    def __make_nodes(self):
        """Make dict with node names as keys, and Node objects as values"""
        rawnodes = self.pbsquery.getnodes()
        for n, s in rawnodes.items():
            self.nodes[n] = Node(name=n, pbsnodes_dict=dict(s), debug_p=self.debug_p)

    def __make_jobs(self):
        """Make dict with job IDs as keys, and Job objects as values"""
        rawjobs = self.pbsquery.getjobs()
        for j, p in rawjobs.items():
            self.jobs[j] = Job(id=j, pbsjobs_dict=dict(p), debug_p=self.debug_p)

    def __make_queues(self):
        """Make dict with queue names as keys, and Queue objects as values"""
        rawqueues = self.pbsquery.getqueues()
        for q, p in rawqueues.items():
            self.queues[q] = Queue(name=q, pbsqueue_dict=p)

    def __make_server(self):
        """Set every entry of the server info as an attribute of this object.

        Known entries are converted on the way:
        * state_count -- dict mapping state name to int count
        * resources_default -- mem/pmem become Memory objects, cput/walltime
          become timedeltas (each only if present)
        * resources_assigned -- mem/vmem become Memory objects, ncpus/nodect
          become ints (each only if present)
        * scheduling, query_other_jobs -- 'True'/'False' become booleans
        * scheduler_iteration -- timedelta built from the value as seconds
        * next_job_number, node_check_rate, tcp_timeout, total_jobs -- int
        Any other entry is stored as its single element when it has exactly
        one, and unchanged otherwise.
        """
        self.__serverinfo = self.pbsquery.get_serverinfo()[self.servername]
        if self.debug_p:
            print('FOOBAR: self.serverinfo = %s' % (self.__serverinfo,))
        for k, v in self.__serverinfo.items():
            self.__dict__[k] = None
            if k == 'state_count':
                # Example of state_count: Transit:0 Queued:-6458 Held:6383 Waiting:0 Running:964 Exiting:0
                counts = {}
                # split() without argument tolerates repeated or trailing spaces
                for state in v[0].split():
                    fields = state.split(':')
                    counts[fields[0]] = int(fields[1])
                self.__dict__[k] = counts
            elif k == 'resources_default':
                # guard every key, as done for resources_assigned below: a
                # missing default used to raise KeyError
                for res in ('mem', 'pmem'):
                    if res in v:
                        v[res] = Memory(v[res][0])
                for res in ('cput', 'walltime'):
                    if res in v:
                        v[res] = pbstimestr_to_timedelta(v[res][0])
                self.__dict__[k] = v
            elif k == 'resources_assigned':
                for res in ('mem', 'vmem'):
                    if res in v:
                        v[res] = Memory(v[res][0])
                for res in ('ncpus', 'nodect'):
                    if res in v:
                        v[res] = int(v[res][0])
                self.__dict__[k] = v
            elif k in ('scheduling', 'query_other_jobs'):
                # the raw value is converted in place, as before
                if v[0] == 'True':
                    v[0] = True
                elif v[0] == 'False':
                    v[0] = False
                self.__dict__[k] = v[0]
            elif k == 'scheduler_iteration':
                self.__dict__[k] = datetime.timedelta(seconds=int(v[0]))
            elif k in ('next_job_number', 'node_check_rate', 'tcp_timeout', 'total_jobs'):
                self.__dict__[k] = int(v[0])
            elif len(v) == 1:
                self.__dict__[k] = v[0]
            else:
                self.__dict__[k] = v

    def get_job(self, jobid):
        """Queries the queue for jobid

        Returns a Job built from the answer, or None when the object returned
        by PBSQuery has no 'data' entry in its __dict__."""
        j = self.pbsquery.getjob(jobid)
        if self.debug_p:
            # .get(): the debug output must not raise KeyError in exactly the
            # case that is handled below by returning None
            print('ALOHA:  %s' % (j.__dict__.get('data'),))

        if 'data' in j.__dict__:
            return Job(id=jobid, pbsjobs_dict=dict(j), debug_p=self.debug_p)
        return None

    def jobs_by_user(self, username=None):
        """Returns a dict of jobs (keyed by jobid) belonging to username,
        or None when no username is given"""
        if not username:
            return None
        return dict((jobid, job) for jobid, job in self.jobs.items()
                    if job.owner == username)

    def nodes_with_property(self, prop):
        """Returns a dict of nodes (keyed by nodename) having the given property string.
        Without a property, the complete nodes dict itself (not a copy) is returned"""
        if not prop:
            return self.nodes
        return dict((nodename, node) for nodename, node in self.nodes.items()
                    if prop in node.properties)

    def nodes_in_clan(self, clan):
        """Returns a dict of nodes (keyed by nodename) belonging to the given clan.
        Without a clan, the complete nodes dict itself (not a copy) is returned"""
        if not clan:
            return self.nodes
        return dict((nodename, node) for nodename, node in self.nodes.items()
                    if clan == node.clan)

    def __unicode__(self):
        """Returns the string form of the list of str(job) for all jobs"""
        if self.debug_p:
            print('FOOBAR: type(self.jobs) = %s' % (type(self.jobs),))
            print('FOOBAR: self.jobs = %s' % (self.jobs,))
        return str([str(job) for job in self.jobs.values()])

    def __str__(self):
        return self.__unicode__()
Exemple #52
0
                # get a handle
                conn = pbs.pbs_connect(pbs_server)

                # queue it
                if os.access(job_file, os.R_OK):
                    log.debug(
                        "submitting file %s with output %s and error %s" %
                        (job_file, ofile, efile))
                    log.debug("command is: %s" % command_line)
                    job_id = pbs.pbs_submit(conn, job_attrs, job_file, None,
                                            None)

                    # monitor
                    if job_id:
                        p = PBSQuery()
                        job_data = p.getjob(job_id)
                        old_state = job_data[job_id]["job_state"]
                        log.debug("initial state is %s" % old_state)
                        running = False
                        while True:
                            job_data = p.getjob(job_id)
                            if not job_data:
                                break
                            state = job_data[job_id]["job_state"]
                            if state != old_state:
                                log.debug("job state changed from %s to %s" %
                                          (old_state, state))
                            if state == "R" and not running:
                                running = True
                                for data in out_data.values():
#!/usr/bin/python

from PBSQuery import PBSQuery, PBSError
server = 'grid63.lal.in2p3.fr'

try:
    # Fetch everything up front so that a connection problem is reported once.
    p = PBSQuery(server)
    pbs = p.get_serverinfo()
    nodes = p.getnodes()
    jobs = p.getjobs()
    queues = p.getqueues()
except PBSError as e:
    # `sys` is not imported at the top of this script, so the exit below used
    # to fail with NameError instead of ending the script with status 1.
    import sys
    print("<h3>Error connecting to PBS server:</h3><tt> %s </tt>" % (e,))
    sys.exit(1)


# Dump what was fetched above, one section at a time, each preceded by an
# empty line and a title.

# Server section: the server info object as returned by get_serverinfo().
print ""
print "Server info:"
print pbs

# Nodes section: a banner with the node name, then that node's data.
print ""
print "Nodes:"
for node in nodes.keys():
  print "********** %s *********" % node
  print nodes[node]

# Jobs section: a banner with the job id, then that job's data.
print ""
print "Jobs:"
for job in jobs.keys():
  print "********** Job %s *********" % job
  print jobs[job]