Example #1
    def is_it_exclusive(job):
        """
        check to see if the job requested exclusive, or if the
        nodes are marked exclusive.  This needs to be passed
        to ATOM.
        """
        place = str(job.Resource_List["place"])
        log_with_caller(pbs.EVENT_DEBUG4, "place is %s" % place)

        # See if the node sharing value has exclusive
        vn = pbs.server().vnode(pbs.get_local_nodename())
        sharing = vn.sharing
        log_with_caller(pbs.EVENT_DEBUG4, "The sharing value is %s type %s" %
                        (str(sharing), str(type(sharing))))

        # Uses the same logic as the scheduler (is_excl())
        if sharing == pbs.ND_FORCE_EXCL or sharing == pbs.ND_FORCE_EXCLHOST:
            return True

        if sharing == pbs.ND_IGNORE_EXCL:
            return False

        if any(s.startswith('excl') for s in place.split(':')):
            return True
        if any(s.startswith('shared') for s in place.split(':')):
            return False

        if (sharing == pbs.ND_DEFAULT_EXCL or
            sharing == pbs.ND_DEFAULT_EXCLHOST):
            return True

        if sharing == pbs.ND_DEFAULT_SHARED:
            return False

        return False
Example #2
def check_pbs():
    vnodes = pbs.server().vnodes()
    free = 0
    for v in vnodes:
        if v.resources_available["vntype"] and \
           v.resources_available["vntype"] != "cray_login" and \
           ((v.state == pbs.ND_FREE) or (v.state == pbs.ND_JOB_EXCLUSIVE) or (v.state == pbs.ND_RESV_EXCLUSIVE)):
            free += 1
    return free
Example #4
def _svr_vnode(name):
    # Return a vnode object obtained from the server by name.
    # Save the values in a global dictionary for future use.
    global pmi_pbsconf
    if "pmi_pbsvnodes" not in globals():
        global pmi_pbsvnodes
        pmi_pbsvnodes = dict()
        for vn in pbs.server().vnodes():
            pmi_pbsvnodes[vn.name] = vn
    return pmi_pbsvnodes[name]
Example #6
def getJobs():
    job_lst = []
    jobs = {}
    s = pbs.server()
    if s.vnode(local_node).jobs:
        job_lst = s.vnode(local_node).jobs.split(',')
        for job in job_lst:
            job = re.sub(r'/\d+', '', job)  # strip the trailing "/<vnode index>" from each entry
            job = job.replace(" ", "")
            jobs[job] = 1
    return jobs
Example #7
    def __init__(self, e):
        self.rc = -1
        self.e = e

        self.parse_cfg()

        self.nodename = pbs.get_local_nodename()

        try:
            self.node = pbs.server().vnode(self.nodename)
        except:
            pbs.logmsg(pbs.EVENT_DEBUG, "Health-check hook; failed to get node info from server")
            self.e.reject()

        self.vnl = self.e.vnode_list
Example #8
def get_local_node(name):
    # Get host names from /etc/hosts and return matching name for the MoM
    try:
        (hostname, aliaslist, _) = socket.gethostbyname_ex(name)
    except:
        return None
    aliaslist.append(hostname)
    # Search for possible match in server vnode list.
    pbsvnodes = dict()
    for vn in pbs.server().vnodes():
        pbsvnodes[vn.name] = vn
    for n in aliaslist:
        if n in pbsvnodes:
            return pbsvnodes[n]
    return None
Example #9
If a Nastran job fails, this hook keeps only the solver deck ('bdf') file and the
'f04' and 'f06' files for diagnostics, and removes the other temporary solver files
to save time.

The default order of this hook is 1.

To keep more files from failed Nastran jobs, add extra file extensions to reserve_ext.
Follow these steps to put the hook into effect:
	qmgr -c "c h nastran_ep"
	qmgr -c "s h nastran_ep event=execjob_epilogue"
	qmgr -c "i h nastran_ep application/x-python default nastran_ep.py"
'''
import os 
import sys
import pbs 

j = pbs.event().job
j_query = pbs.server().job(j.id)

soft = str(j_query.Resource_List["software"])

# Only act on Nastran jobs; let everything else pass through untouched.
if soft != 'Nastran':
    sys.exit(0)

# File extensions (plus the stdout/stderr suffixes) that must be preserved.
reserve_ext = ['f04', 'f06', 'bdf']

j_id = j.id.split(".")[0]
reserve_ext.append("o" + j_id)
reserve_ext.append("e" + j_id)
Example #10
    return None


# Main
pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: START")

# Determine the minute of the time.
now = time.strftime("%M", time.gmtime())

if ADDITIONAL_DEBUG:
    pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: ADDITIONAL DEBUG, the minute right now = %s" % (now))

# Identify the cray_login nodes, which are running the pbs_mom and put them
# into a list.
cray_login = []
vnodes = pbs.server().vnodes()
for v in vnodes:
    if v.resources_available["vntype"] and \
       v.resources_available["vntype"] == "cray_login" and \
       ((v.state == pbs.ND_FREE) or (v.state == pbs.ND_JOB_EXCLUSIVE) or (v.state == pbs.ND_RESV_EXCLUSIVE)):
        cray_login.append(str(v))

if ADDITIONAL_DEBUG:
    for cl in cray_login:
        pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: ADDITIONAL DEBUG, Eligible Cray Login Nodes = %s" % (cl))

# Determine the total number of cray_login nodes
cray_login_total = len(cray_login)
if cray_login_total > 0:
    pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: Total Eligible Cray Login Nodes = %s" % (cray_login_total))
else:
Example #11
    return False


# MAIN LOGIC BEGIN
try:
    queue_type = extract_queue_type()

    walltime = extract_walltime()
    selection = extract_selection()

    if queue_type == "private":
        if pbs.event().job.project:
            pbs.event().reject(
                "Express project codes can not be used with private queues")

        queue = pbs.server().queue(pbs.event().job.queue.name)
        if not queue:
            pbs.event().reject("Invalid queue name")

        permitted_groups = queue.resources_available['permitted_groups']
        if permitted_groups:
            permitted_groups = permitted_groups.split(',')
            if not test_group_membership(permitted_groups):
                pbs.event().reject(
                    "You are not authorised to use this private queue")

        fixup_mpiprocs_ompthreads(selection)
        # PQs are small, and offline resources may prevent jobs from running.
        # Prevent such jobs from becoming topjobs.
        # Exclude med-bio: its jobs can be topjobs, which lets largemem jobs compete against tiny ones.
        if queue_name() != "med-bio":
Example #12
        if (e.job.queue.name in ["adis_in", "power_0qa1"]):

            if (e.job.Variable_List["in_queue_credential"] != "1"):

                e.reject(
                    "Error, Make sure you use --> qsub_in <-- command to submit to the queue!"
                )

            power_0qa1_nodes_list = [
                'compute-0-166.power5', 'compute-0-167.power5'
            ]

            for qnode in power_0qa1_nodes_list:

                node = pbs.server().vnode(qnode)
                node_ncpu_total = node.resources_available["ncpus"]
                node_ncpu_used = node.resources_assigned["ncpus"]
                node_ncpu_free = node_ncpu_total - node_ncpu_used
                job_ncpus = e.job.Resource_List["ncpus"]

                if (job_ncpus > node_ncpu_free):
                    pass

                else:
                    e.accept()
                    break

            adisq = pbs.server().queue("adis")
            e.job.queue = adisq
Example #13
import pbs
import sys

#qmgr -c 'create hook check_and_route_adis_0q event="queuejob"';qmgr -c 's h check_and_route_adis_0q debug=True'
#qmgr -c 'import hook check_and_route_adis_0q application/x-python default /var/spool/PBS/hooks/check_and_route_adis_0q.py' 
#qmgr -c 'd h check_and_route_adis_0q'
try:
	
	e = pbs.event()
	
	if e.job.queue:
		
		target_qname = e.job.queue.name
		
		if (target_qname in ["adis_0q"]):
		
			adisq = pbs.server().queue("adis")
			adis_used_ncpus = adisq.resources_assigned["ncpus"]
			
			if (adis_used_ncpus >= 2):
				e.job.queue = adisq
			
	
	# accept the event
				e.accept()
except:
#	e.reject("Failed to route job to queue adis_0q")
	e.accept("Failed to route job to queue adis_0q")
Example #14
import pbs
import sys

try:
# Get the hook event information and parameters
# This will be for the 'queuejob' event type.
	e = pbs.event()
# Get the information for the job being queued
	j = e.job
	if j.interactive:
# Get the “interQ” queue object
		q = pbs.server().queue("live_q")
# Reset the job's destination queue
# parameter for this event
		j.queue = q
# accept the event
		e.accept()
except SystemExit:
	pass
except:
	e.reject("Failed to route job to queue live_q")
Example #15
vnode = e.vnode
aoe = e.aoe

pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: Env = %s" % repr(os.environ))
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: PBS Node = %s" % vnode)
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: AOE = %s" % aoe)

# The provision hook runs on the PBS server, but provisioning is started from the
# admin node; the two may not be the same host.
# Determine the admin node by reading the JSON config file.
if 'PBS_HOOK_CONFIG_FILE' in os.environ:
    import json
    config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
    #pbs.logmsg(pbs.EVENT_DEBUG, "%s: Config file is %s" % (caller_name(), config_file))
    config = json.load(open(config_file, 'r'), object_hook=decode_dict)

server = pbs.server().name
admin = config['admin-node']

pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: server name = %s" % server)
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: admin node = %s" % admin)

if admin == server:
    ret = os.system(
        "/opt/clmgr/contrib/hpcm_pbspro_connector/bin/hpcm_provision.sh " +
        aoe + " " + vnode)
    if ret != 0:
        pbs.logmsg(pbs.LOG_DEBUG,
                   "PROVISIONING: Failed - retcode = %s" % str(ret))
        e.reject("Reboot provisioning failed", ret)
    else:
        e.accept(0)
Example #16
sys.path += [
    '', '/opt/pbs/default/python/lib/python25.zip',
    '/opt/pbs/default/python/lib/python2.5',
    '/opt/pbs/default/python/lib/python2.5/plat-linux2',
    '/opt/pbs/default/python/lib/python2.5/lib-tk',
    '/opt/pbs/default/python/lib/python2.5/lib-dynload',
    '/opt/pbs/default/python/lib/python2.5/site-packages'
]
import subprocess
import re

e = pbs.event()
j = e.job

path = pbs.server().resources_available['store_path']

os.system('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.OU' + ' ' +
          path)
os.system('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.ER' + ' ' +
          path)

f = open("/tmp/hooks", "w")
#for i in dir(pbs.server()) :
#	f.write("%s\n" % i);
f.write(repr(pbs.server().resources_available['store_path']))
#f.write(j.Error_Path)
#f.write(j.Output_Path)

f.write('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.OU' + ' ' + path)
f.write('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.ER' + ' ' + path)
Example #17
                "You can only request one chunk of the form #PBS -l select=N:ncpus=X:mem=Ygb:mpiprocs=Z:ompthreads=W"
            )
        for chunk in chunks:
            nodect += int(chunk.split(":")[0])
            for rs in chunk.split(":")[1:]:
                kw = rs.split("=")[0]
                if (kw not in list_of_resources):
                    pbs.event().reject(
                        "Select statements can only contain the resources: " +
                        ", ".join(list_of_resources))
                if (kw == "ncpus"):
                    ncpus = int(rs.split("=")[1])

        matched = False
        if nodect <= 18 and ncpus == 24 and walltime <= pbs.duration("2:0:0"):
            pbs.event().job.queue = pbs.server().queue("short")
            matched = True
        if not matched and nodect >= 2 and nodect <= 18 and ncpus == 16 and walltime <= pbs.duration(
                "72:0:0"):
            pbs.event().job.queue = pbs.server().queue("general")
            matched = True
        if not matched and nodect >= 72 and nodect <= 270 and ncpus in [
                28
        ] and walltime <= pbs.duration("24:0:0"):
            pbs.event().job.queue = pbs.server().queue("capability")
            matched = True
        if not matched and nodect >= 18 and nodect <= 72 and ncpus in [
                24, 28
        ] and walltime <= pbs.duration("48:0:0"):
            pbs.event().job.queue = pbs.server().queue("large")
            matched = True
Example #18
# To instantiate this hook, specify the following:
#    qmgr -c "create hook load_balance event=exechost_periodic,freq=10"
#    qmgr -c "import hook load_balance application/x-python default load_balance.py"
import pbs
import os
import re

ideal_load = 1.5
max_load = 2.0


# get_la: returns a list of load averages within the past 1-minute, 5-minute,
#         15-minutes range.
def get_la():
    line = os.popen("uptime").read()
    r = re.search(r'load average: (\S+), (\S+), (\S+)$', line).groups()
    return [float(x) for x in r]  # return a list so it can be indexed under Python 3


local_node = pbs.get_local_nodename()

vnl = pbs.event().vnode_list
current_state = pbs.server().vnode(local_node).state
mla = get_la()[0]
if (mla >= max_load) and ((current_state & pbs.ND_OFFLINE) == 0):
    vnl[local_node].state = pbs.ND_OFFLINE
    vnl[local_node].comment = "offlined node as it is heavily loaded"
elif (mla < ideal_load) and ((current_state & pbs.ND_OFFLINE) != 0):
    vnl[local_node].state = pbs.ND_FREE
    vnl[local_node].comment = None
Example #19
import pbs
import sys

#qmgr -c 'create hook check_fair_share_q event="queuejob"';qmgr -c 's h check_fair_share_q debug=True'
#qmgr -c 'import hook check_fair_share_q application/x-python default /var/spool/PBS/hooks/check_fair_share_q.py'
#qmgr -c 'd h check_fair_share_q'
try:

    e = pbs.event()

    if e.job.queue:

        target_qname = e.job.queue.name

        if (target_qname in ["fair_test"]):

            pbs.server().job()
            e.accept()

#			adisq = pbs.server().queue("adis")
#			adis_used_ncpus = adisq.resources_assigned["ncpus"]

#			if (adis_used_ncpus >= 2):
#				e.job.queue = adisq

# accept the event
except:
    #	e.reject("Failed to route job to queue adis_0q")
    e.accept()
Example #20
# Main
sys.path.append(PBS_EXEC + '/python/lib/python2.7')
sys.path.append(PBS_EXEC + '/python/lib/python2.7/lib-dynload')
from subprocess import Popen, PIPE
from sets import Set

try:
  e = pbs.event()
  j = e.job
 
  # Get the username
  who = str(e.requestor)

  # Get queue
  if j.queue == '':
      q = pbs.server().default_queue.name
  else:
      q = j.queue.name

  # Get permitted_groups or accept the job

  permitted_groups = pbs.server().queue(q).resources_available['permitted_groups']
  if permitted_groups == None:
    e.accept()
  else:
    permitted_groups = permitted_groups.split(',')

  # Build a list of users from all permitted groups
  users = Set([])
  try:
    for g in permitted_groups:
Example #21
### resource.
if ( 'PBS_GET_IBWINS' in myjob.Variable_List ):
   pbs.logmsg(pbs.LOG_WARNING, "User requested that netwins be calculated")
else:
   sys.exit()

### script Defaults area
### Reasonable internal defaults for networks, instances, and max instances
default_networks = 2
max_instances = 4
instances = 1
msg_api = ""
mpiprocs=1

# Put this closer to actual reference for efficiency.
mysrv = pbs.server()

debug_me = False

# If resources_available.debug_hooks contains the name of this hook, then we
# turn on the debug flag.
if ( "debug_hooks" in mysrv.resources_available and 
      my_name in str(
         mysrv.resources_available['debug_hooks']).split(',')
      ):
   debug_me=True

def dbg_svr_log(string):
   '''quick function to wrap debug logging to the server'''
   # Abort if the hook_debug value is not set
   if(debug_me):
Example #22
import pbs
import sys

#qmgr -c 'create hook check_fair_share_q event="queuejob"';qmgr -c 's h check_fair_share_q debug=True'
#qmgr -c 'import hook check_fair_share_q application/x-python default /var/spool/PBS/hooks/check_fair_share_q.py' 
#qmgr -c 'd h check_fair_share_q'
try:
	
	e = pbs.event()
	
	if e.job.queue:
		
		target_qname = e.job.queue.name
		
		if (target_qname in ["fair_test"]):
		
			pbs.server().job()
			e.accept()
	
#			adisq = pbs.server().queue("adis")
#			adis_used_ncpus = adisq.resources_assigned["ncpus"]
			
#			if (adis_used_ncpus >= 2):
#				e.job.queue = adisq
			
	
	# accept the event
except:
#	e.reject("Failed to route job to queue adis_0q")
	e.accept()
Example #23
                resources[node_i]['ncpus'] = 0
            resources[node_i]['ncpus'] += int(m.group(1))

    return resources

try:
    e = pbs.event()
    if e.type == pbs.RUNJOB:
        j = e.job

        if j.queue and re.match('^[RM]{1}[0-9]+', j.queue.name):
            e.accept()

        resources = parse_exec_vnode(j.exec_vnode)
        for nodename in resources.keys():
            node = pbs.server().vnode(nodename)

            available_ncpus = node.resources_available['ncpus']
            assigned_ncpus = node.resources_assigned['ncpus']

            try:
                requested_ncpus = resources[nodename]['ncpus']
            except KeyError:
                requested_ncpus = 1

            if assigned_ncpus + requested_ncpus > available_ncpus:
                now = time.strftime("%Y%m%d%H%M%S", time.localtime())
                jobs = []

                filename = "/tmp/pbs_overcommit_detector_%s_%s_overcommit_by_%s" % (now, nodename, j.id)
Example #24
import pbs
import sys

try:
    # Get the hook event information and parameters
    # This will be for the 'queuejob' event type.
    e = pbs.event()
    # Get the information for the job being queued
    j = e.job
    if j.interactive:
        # Get the “interQ” queue object
        q = pbs.server().queue("live_q")
        # Reset the job's destination queue
        # parameter for this event
        j.queue = q
        # accept the event
        e.accept()
except SystemExit:
    pass
except:
    e.reject("Failed to route job to queue live_q")
Example #25
# specified in 'high_priority_queue', and also tells the server to restart
# the scheduling cycle. This is for faster qsub -Is throughput.
#
# Prerequisite:
#    Site must define a "high" queue as follows:
#        qmgr -c "create queue high queue_type=e,Priority=150
#        qenable high
#        qstart high
#    NOTE:
#        A) 150 is the default priority for an express (high) queue.
#           This will cause the interactive job to preempt currently running
#           work.
#        B) If the site does not want this, lower the priority of the high
#           priority queue.  This might not cause the job to run right away,
#           but the scheduler will still try.
#
#    This hook is instantiated as follows:
#        qmgr -c "create hook rapid event=queuejob"
#        qmgr -c "import hook rapid_inter application/x-python default rapid_inter.py"
import pbs

high_priority_queue = "high"

e = pbs.event()
if e.job.interactive:
    high = pbs.server().queue(high_priority_queue)
    if high != None:
        e.job.queue = high
        pbs.logmsg(pbs.LOG_DEBUG, "quick start interactive job")
        pbs.server().scheduler_restart_cycle()
Example #26
            mynodename = pbs.get_local_nodename()
            pbs.logmsg(pbs.EVENT_DEBUG3, "got node: %s" % (mynodename))
            myvnode.state = pbs.ND_FREE
            pbs.logmsg(pbs.EVENT_DEBUG,
                       "Changed node state to ND_FREE: %s" % (mynodename))
            myvnode.comment = None
            pbs.logmsg(pbs.EVENT_DEBUG, "Onlined node: %s" % (mynodename))

        else:
            return True


if __name__ == "__builtin__":
    start = time.time()
    pbs.logmsg(pbs.EVENT_DEBUG3, "Starting the node health check")
    c = NodeHealthCheck()

    if pbs.event().type == pbs.EXECHOST_PERIODIC:
        vnode = pbs.server().vnode(c.host)
        if vnode.state == pbs.ND_OFFLINE and vnode.comment.startswith(
                '-attn_nhc:'):
            # Still need to flesh out CheckOfflineNode function
            c.CheckOfflineNode()
        else:
            c.CheckNodePeriodic()
    else:
        c.CheckNode()

    pbs.logmsg(pbs.EVENT_DEBUG3, "Finished check disk hook: %0.5lf (s)" %
               (time.time() - start))
	
Example #27
	e = pbs.event()
	
	if e.job.queue:
	
		if (e.job.queue.name in ["adis_in","power_0qa1"]):
		
			if (e.job.Variable_List["in_queue_credential"] != "1"):
				
		        	e.reject("Error, Make sure you use --> qsub_in <-- command to submit to the queue!")
			
			power_0qa1_nodes_list = ['compute-0-166.power5','compute-0-167.power5']
			
			for qnode in power_0qa1_nodes_list:
			
				node=pbs.server().vnode(qnode)
				node_ncpu_total = node.resources_available["ncpus"]
				node_ncpu_used = node.resources_assigned["ncpus"]
				node_ncpu_free = node_ncpu_total - node_ncpu_used
				job_ncpus = e.job.Resource_List["ncpus"]
	
				if ( job_ncpus > node_ncpu_free ):
					pass
			
				else:
					e.accept()
					break
	
			adisq = pbs.server().queue("adis")
			e.job.queue = adisq	
Example #28
		if pbs.event().job.project:
			pbs.event().reject( "Express project codes can not be used with reservations" )
		pbs.event().accept()



	# private queues
	# check group membership
	# fix ompthreads/mpiprocs
	# check that selection is valid
	if queue_type == "private":
		if pbs.event().job.project:
			pbs.event().reject( "Express project codes can not be used with private queues" )

		queue = pbs.server().queue(pbs.event().job.queue.name)
		if not queue:
			pbs.event().reject("Invalid queue name")

		permitted_groups = queue.resources_available["permitted_groups"]
		if permitted_groups:
			permitted_groups = permitted_groups.split(",")
			if not test_group_membership( permitted_groups ):
				pbs.event().reject("You are not authorised to use this private queue")
		fixup_mpiprocs_ompthreads( selection )

	#	check_pq_restriction( selection, walltime, pbs.event().job.queue.name )
		pbs.event().accept()

	# Express version 0 - accept anything provided the user is in an exp-XXX group
	express = False
Example #29
#
# To instantiate this hook, specify the following:
#    qmgr -c "create hook load_balance event=exechost_periodic,freq=10"
#    qmgr -c "import hook load_balance application/x-python default load_balance.py"
import pbs
import os
import re

ideal_load=1.5
max_load=2.0

# get_la: returns a list of load averages within the past 1-minute, 5-minute,
#         15-minutes range.
def get_la():
    line=os.popen("uptime").read()
    r = re.search(r'load average: (\S+), (\S+), (\S+)$', line).groups()
    return [float(x) for x in r]  # return a list so it can be indexed under Python 3

local_node = pbs.get_local_nodename()

vnl = pbs.event().vnode_list
current_state = pbs.server().vnode(local_node).state
mla = get_la()[0]
if (mla >= max_load) and ((current_state & pbs.ND_OFFLINE) == 0):
    vnl[local_node].state = pbs.ND_OFFLINE
    vnl[local_node].comment = "offlined node as it is heavily loaded"
elif (mla < ideal_load) and ((current_state & pbs.ND_OFFLINE) != 0):
    vnl[local_node].state = pbs.ND_FREE
    vnl[local_node].comment = None

Example #30
            pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(self.host))
            mynodename = pbs.get_local_nodename()
            myvnode = pbs.event().vnode_list[mynodename]
            mynodename = pbs.get_local_nodename()
            pbs.logmsg(pbs.EVENT_DEBUG3,"got node: %s"%(mynodename))
            myvnode.state = pbs.ND_FREE
            pbs.logmsg(pbs.EVENT_DEBUG,"Changed node state to ND_FREE: %s"%(mynodename))
            myvnode.comment =  None
            pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(mynodename))
            
        else:
            return True
            
if __name__ == "__builtin__":
    start = time.time()
    pbs.logmsg(pbs.EVENT_DEBUG3,"Starting the node health check")
    c = NodeHealthCheck()

    if pbs.event().type == pbs.EXECHOST_PERIODIC:
        vnode = pbs.server().vnode(c.host)
        if vnode.state == pbs.ND_OFFLINE and vnode.comment.startswith('-attn_nhc:'):
            # Still need to flesh out CheckOfflineNode function
            c.CheckOfflineNode()
        else:
            c.CheckNodePeriodic()
    else:
        c.CheckNode()

    pbs.logmsg(pbs.EVENT_DEBUG3,"Finished check disk hook: %0.5lf (s)"%(time.time()-start))

Example #31
# the scheduling cycle. This is for faster qsub -Is throughput.
#
# Prerequisite:
#    Site must define a "high" queue as follows:
#        qmgr -c "create queue high queue_type=e,Priority=150
#        qenable high
#        qstart high
#    NOTE:
#        A) 150 is the default priority for an express (high) queue.
#           This will cause the interactive job to preempt currently running
#           work.
#        B) If the site does not want this, lower the priority of the high
#           priority queue.  This might not cause the job to run right away,
#           but the scheduler will still try.
#
#    This hook is instantiated as follows:
#        qmgr -c "create hook rapid event=queuejob"
#        qmgr -c "import hook rapid_inter application/x-python default rapid_inter.py"
import pbs

high_priority_queue="high"

e = pbs.event()
if e.job.interactive:
    high = pbs.server().queue(high_priority_queue)
    if high != None:
        e.job.queue = high
        pbs.logmsg(pbs.LOG_DEBUG, "quick start interactive job")
        pbs.server().scheduler_restart_cycle()

Example #32
now_mn = int(time.strftime("%M", time.gmtime()))
msg = []

if not os.path.isfile(XTHOSTNAME):
    msg += ["No %s file found on this host." % (XTHOSTNAME)]
    __exit_hook(0, msg)

# XTHOSTNAME file found on this host. Read it to determine our Cray hostname.

with open(XTHOSTNAME) as xthost_file:
    my_crayhost = xthost_file.readline()
    my_crayhost = my_crayhost.rstrip()
    msg += ["Processing ALPS inventory for crayhost %s" % (my_crayhost)]

start = time.time()
vnodes = pbs.server().vnodes()
vnodes_query_duration = time.time() - start
if not vnodes:
    msg += ["ALPS Inventory Check: No vnodes reported by PBS"]
    __exit_hook(1, msg)

down_states = pbs.ND_DOWN | pbs.ND_STALE | pbs.ND_STATE_UNKNOWN

for v in vnodes:
    str_v = str(v)
    vntype = " "

    if (v.state & down_states) or \
        "PBScrayhost" not in v.resources_available or \
            v.resources_available["PBScrayhost"] != my_crayhost:
        continue
Example #33
import pbs
import os
import re
import sys

e = pbs.event()
try:
    if e.job.in_ms_mom():
        exit_code = str(e.job.Exit_status)

        local_node = pbs.get_local_nodename()
        vnl = e.vnode_list
        current_state = pbs.server().vnode(local_node).state

        # Offline the node when the job exited nonzero and it is not already offline.
        if (int(exit_code) != 0) and ((current_state & pbs.ND_OFFLINE) == 0):
            vnl[local_node].state = pbs.ND_OFFLINE
            vnl[local_node].comment = "offlined node because the job exited with a nonzero status"

            report_file1 = str("/home/centos/pqr.txt")
            pbs.logmsg(pbs.LOG_DEBUG, "report_usage file1 is %s" % report_file1)

            fd_out1 = open(report_file1, 'w+')
            fd_out1.write('To: [email protected]\n')
            fd_out1.write('From: [email protected]\n')
            fd_out1.write('Subject: Node Taken offline\n')
            fd_out1.write('Node Name: = ' + pbs.server().vnode(local_node).name + '\n')
            fd_out1.close()

            mail_cmd="/usr/sbin/sendmail -t \"PBS OSS\" < /home/centos/pqr.txt"
            pbs.logmsg(pbs.LOG_DEBUG, "mail_command  is %s" % mail_cmd)
            os.popen(mail_cmd)