Example #1
def classify_job(selection, walltime, express=False, queue=None):
    names = ""
    ret = []

    for clssname in sorted(classifications.keys()):
        clssset = classifications[clssname]
        for clss in clssset:
            if queue is None or clssname == queue:
                if match_class(selection, walltime, clssname, clss, express):
                    clss["target_queue"] = clssname
                    ret.append(clss)
                    names = names + clssname + " "

# Always go with the first if more than one class has matched (lexical order of class name).
#	if len(ret) > 1:
#		pbs.event().reject( "Job matches multiple classes: [ " + names.strip() + "]" )

    if len(ret):
        return ret[0]
    else:
        pbs.logmsg(
            pbs.LOG_ERROR, "FAILED TO MATCH A JOB:" +
            repr(pbs.event().job.Resource_List["select"]) + " walltime " +
            repr(pbs.event().job.Resource_List["walltime"]))
        pbs.event().reject(
            "Job resource selection does not match any permitted configuration.\n      Please review the CX1 Job Sizing guidance on:\n       https://bit.ly/2AInEIj\n"
        )
Example #2
    def __init__(self, **kwords):
        self.host = ''
        self.user = ''
        self.job_id = ''
        self.nhc_cfg = None

        # Set up the values for host and user
        pbs.logmsg(pbs.EVENT_DEBUG3,"get node name")
        self.host = pbs.get_local_nodename()

        # Read in the configurations file
        pbs_hook_cfg = pbs.hook_config_filename
        if pbs_hook_cfg is None:
            pbs.logmsg(pbs.EVENT_DEBUG3,"%s"%os.environ)
            pbs_hook_cfg = os.environ["PBS_HOOK_CONFIG_FILE"]
        pbs.logmsg(pbs.EVENT_DEBUG3,"read config file: %s"%pbs_hook_cfg)
        config_file = open(pbs_hook_cfg).read()
        
        self.nhc_cfg = json.loads(config_file)
        pbs.logmsg(pbs.EVENT_DEBUG3,"config file: %s"%self.nhc_cfg)

        # Check to make sure the event has a user associated with it
        pbs.logmsg(pbs.EVENT_DEBUG3,'Event: %s'%pbs.event().type)
        if pbs.event().type != pbs.EXECHOST_PERIODIC:
            self.user = repr(pbs.event().job.Job_Owner).split("@")[0].replace("'","")
            self.job_id = pbs.event().job.id
        else:
            self.user = '******'
            self.job_id = str(time.time())

        pbs.logmsg(pbs.EVENT_DEBUG3,'Done initializing NodeHealthCheck')
Example #3
def queue_name():
    if pbs.event().job.queue is None:
        return ""
    if isinstance(pbs.event().job.queue, str):
        return pbs.event().job.queue
    else:
        return pbs.event().job.queue.name
Example #4
    def activate_profile(self, profile_name=None, job=None):
        self._check_pmi()
        if job is None:
            job = pbs.event().job

        try:
            ret = self.__pmi._activate_profile(profile_name, job)
            if profile_name is not None:
                hosts = _get_vnode_names(job)
                for h in hosts:
                    try:
                        pbs.event().vnode_list[h].current_eoe = profile_name
                    except:
                        pass
            return ret
        except BackendError as e:
            # get fresh set of profile names, ignore errors
            mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
            if mynode.power_provisioning:
                try:
                    profiles = self.__pmi._query(
                        pbs.Power.QUERY_PROFILE)
                    names = self._map_profile_names(profiles)
                    mynode.resources_available["eoe"] = names
                    pbs.logmsg(pbs.LOG_WARNING,
                               "PMI:activate: set eoe: %s" % names)
                except:
                    pass
            raise BackendError(e)
        except InternalError as e:
            # couldn't do activation so set vnode offline
            me = pbs.get_local_nodename()
            pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
            pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
            raise InternalError(e)
Example #5
    def __init__(self, **kwords):
        self.host = ''
        self.user = ''
        self.job_id = ''
        self.nhc_cfg = None

        # Set up the values for host and user
        pbs.logmsg(pbs.EVENT_DEBUG3, "get node name")
        self.host = pbs.get_local_nodename()

        # Read in the configurations file
        pbs_hook_cfg = pbs.hook_config_filename
        if pbs_hook_cfg is None:
            pbs.logmsg(pbs.EVENT_DEBUG3, "%s" % os.environ)
            pbs_hook_cfg = os.environ["PBS_HOOK_CONFIG_FILE"]
        pbs.logmsg(pbs.EVENT_DEBUG3, "read config file: %s" %
                   pbs.hook_config_filename)
        config_file = open(pbs.hook_config_filename).read()

        self.nhc_cfg = json.loads(config_file)
        pbs.logmsg(pbs.EVENT_DEBUG3, "config file: %s" % self.nhc_cfg)

        # Check to make sure the event has a user associated with it
        pbs.logmsg(pbs.EVENT_DEBUG3, 'Event: %s' % pbs.event().type)
        if pbs.event().type != pbs.EXECHOST_PERIODIC:
            self.user = repr(pbs.event().job.Job_Owner).split("@")[
                0].replace("'", "")
            self.job_id = pbs.event().job.id
        else:
            self.user = '******'
            self.job_id = str(time.time())

        pbs.logmsg(pbs.EVENT_DEBUG3, 'Done initializing NodeHealthCheck')
Example #6
    def activate_profile(self, profile_name=None, job=None):
        self._check_pmi()
        if job is None:
            job = pbs.event().job

        try:
            ret = self.__pmi._activate_profile(profile_name, job)
            if profile_name is not None:
                hosts = _get_vnode_names(job)
                for h in hosts:
                    try:
                        pbs.event().vnode_list[h].current_eoe = profile_name
                    except:
                        pass
            return ret
        except BackendError as e:
            # get fresh set of profile names, ignore errors
            mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
            if mynode.power_provisioning:
                try:
                    profiles = self.__pmi._query(pbs.Power.QUERY_PROFILE)
                    names = self._map_profile_names(profiles)
                    mynode.resources_available["eoe"] = names
                    pbs.logmsg(pbs.LOG_WARNING,
                               "PMI:activate: set eoe: %s" % names)
                except:
                    pass
            raise BackendError(e)
        except InternalError as e:
            # couldn't do activation so set vnode offline
            me = pbs.get_local_nodename()
            pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
            pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
            raise InternalError(e)
Example #7
def extract_walltime():
	if pbs.event().job.Resource_List["walltime"] is None:
		pbs.event().reject("You must specify a walltime using the format\n      -lwalltime=HH:MM:00")

	wt = pbs.event().job.Resource_List["walltime"]
	wt = float(wt) / 3600.
	return wt
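
# For illustration (assuming the walltime in Resource_List converts to seconds via float()):
#   -lwalltime=02:30:00  ->  float(wt) == 9000.0  ->  extract_walltime() returns 2.5 (hours)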
Example #8
def execjob_end():

    pbs.logmsg(pbs.LOG_DEBUG, "Removing any GPUs assigned to the job")
    job = pbs.event().job.id
    vn = pbs.event().vnode_list
    # The value does not matter; just remove the job from the file after completion
    value = ""
    UpdateGpuJobs(job, value, GpuJobsPath, add=False)
Example #9
def check_pq_restriction( selection, walltime, queue ):
	if queue not in private_queue_restrictions:
		# There wasn't a restriction for this pq, so accept it
		return

	clss = private_queue_restrictions[ queue ]
	for c in clss:
		if match_class( selection, walltime, queue, c, False ):
			return True
	pbs.event().reject( "The private queue has insufficient resources for a job this size.\nPlease consult https://selfservice.rcs.imperial.ac.uk/pqs/nodes/%s to see what's available.\nFor any queries please contact us via [email protected]" % ( queue ) )
Example #10
 def validate_config(cls):
     """
     Validate the config file
     This will check if the unix_socket_file resolves to a file.
     """
     if not os.path.exists(cls.get_config()['unix_socket_file']):
         log_with_caller(pbs.EVENT_DEBUG4,
                         'Unix socket file does not exist, skipping hook',
                         jobid=False)
         pbs.event().accept()
Example #11
    def __init__(self):
        PBS_SPOOL=os.path.join(pbs_conf()['PBS_MOM_HOME'], 'spool')
        self.stdout_log=os.path.join(PBS_SPOOL,
                                     '%s.OU' % str(pbs.event().job.id))
        self.stderr_log=os.path.join(PBS_SPOOL,
                                     '%s.ER' % str(pbs.event().job.id))

        if str(pbs.event().job.Join_Path) == 'oe':
            self.stderr_log=self.stdout_log
        elif str(pbs.event().job.Join_Path) == 'eo':
            self.stdout_log=self.stderr_log
Example #12
    def __init__(self):
        PBS_SPOOL = os.path.join(pbs_conf()['PBS_MOM_HOME'], 'spool')
        self.stdout_log = os.path.join(PBS_SPOOL,
                                       '%s.OU' % str(pbs.event().job.id))
        self.stderr_log = os.path.join(PBS_SPOOL,
                                       '%s.ER' % str(pbs.event().job.id))

        if str(pbs.event().job.Join_Path) == 'oe':
            self.stderr_log = self.stdout_log
        elif str(pbs.event().job.Join_Path) == 'eo':
            self.stdout_log = self.stderr_log
Example #13
    def deactivate_profile(self, job=None):
        self._check_pmi()

        if job is None:
            job = pbs.event().job
        if _running_excl(job):
            pbs.logjobmsg(job.id, "PMI: reset current_eoe")
            for h in _get_vnode_names(job):
                try:
                    pbs.event().vnode_list[h].current_eoe = None
                except:
                    pass
        return self.__pmi._deactivate_profile(job)
Example #14
    def deactivate_profile(self, job=None):
        self._check_pmi()

        if job is None:
            job = pbs.event().job
        if _running_excl(job):
            pbs.logjobmsg(job.id, "PMI: reset current_eoe")
            for h in _get_vnode_names(job):
                try:
                    pbs.event().vnode_list[h].current_eoe = None
                except:
                    pass
        return self.__pmi._deactivate_profile(job)
Example #15
def log_with_caller(sev, mes, caller=0, jobid=True):
    """
    Wrapper to pbs.logmsg with caller's name prepended

    Increment caller to get the caller of the calling function

    If jobid is true, add the jobid from the event to the log message
    """
    if jobid:
        pbs.logmsg(sev, '%s:%s:%s: %s' %
                   (pbs.event().hook_name, pbs.event().job.id,
                    caller_name(2 + caller), mes))
    else:
        pbs.logmsg(sev, '%s:%s: %s' %
                   (pbs.event().hook_name, caller_name(2 + caller), mes))
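
# Minimal usage sketch (hypothetical messages), matching how Examples #17 and #36 call this wrapper:
#   log_with_caller(pbs.EVENT_DEBUG4, 'UID is %d' % uid)                    # logs hook name, job id and caller
#   log_with_caller(pbs.EVENT_DEBUG4, 'no job in this event', jobid=False)  # skip the job id for job-less events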
Example #16
def test_group_membership(permitted_groups):
    import pbs
    import sys

    #  PBS_EXEC = '/opt/pbs/default'
    GETENT_CMD = '/bin/getent'

    # Main
    #  sys.path.append(PBS_EXEC + '/python/lib/python2.7')
    #  sys.path.append(PBS_EXEC + '/python/lib/python2.7/lib-dynload')
    from subprocess import Popen, PIPE

    e = pbs.event()
    j = e.job

    # Get the username
    who = str(e.requestor)

    # Build a list of users from all permitted groups
    for g in permitted_groups:
        output = Popen([GETENT_CMD, "group", g],
                       stdout=PIPE).communicate()[0].strip()
        output = output.split(':')[-1].split(',')
        if who in output:
            return True

    return False
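
# Usage sketch (hypothetical group name); Example #31 calls this as test_group_membership([project]):
#   if not test_group_membership(["exp-abc123"]):
#       pbs.event().reject("You are not authorised to use this express code")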
Example #17
def handle_execjob_begin():
    """
    Handler for execjob_begin events.
    """
    log_function_name()
    event = pbs.event()
    jid = event.job.id
    uid = pwd.getpwnam(event.job.euser).pw_uid
    log_with_caller(pbs.EVENT_DEBUG4, 'UID is %d' % uid)
    data = {
        'jobid': jid,
        'uid': uid
    }
    url = HookHelper.build_path(resource='job')
    timeout = HookHelper.get_config()['post_timeout']
    try:
        r = post(url, json=data, timeout=timeout)
        r.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'POST timed out')
        raise OfflineError('Job POST timed out')
    except requests.HTTPError:
        if r.status_code == 400:
            retry_post(data)
        else:
            log_with_caller(pbs.EVENT_ERROR, 'Invalid status code %d' %
                            r.status_code)
            raise OfflineError('Job POST encountered invalid status code')
    log_with_caller(pbs.EVENT_DEBUG, 'Job %s registered' % jid)
Example #18
 def connect(self, endpoint=None, port=None, job=None):
     self._check_pmi()
     if job is None:
         try:
             job = pbs.event().job
         except EventIncompatibleError:
             pass
     return self.__pmi._connect(endpoint, port, job)
Example #19
 def connect(self, endpoint=None, port=None, job=None):
     self._check_pmi()
     if job is None:
         try:
             job = pbs.event().job
         except EventIncompatibleError:
             pass
     return self.__pmi._connect(endpoint, port, job)
Example #20
 def disconnect(self, job=None):
     self._check_pmi()
     if job is None:
         try:
             job = pbs.event().job
         except EventIncompatibleError:
             pass
     return self.__pmi._disconnect(job)
Example #21
 def disconnect(self, job=None):
     self._check_pmi()
     if job is None:
         try:
             job = pbs.event().job
         except EventIncompatibleError:
             pass
     return self.__pmi._disconnect(job)
Example #22
    def ContinueChk(self, status, comment=''):
        if isinstance(status, list):
            comment = str(status[1])
            status = status[0].lower()
        elif not isinstance(status, bool):
            status = status.lower()

        # Check to see how to handle the status
        pbs.logmsg(pbs.EVENT_DEBUG3,'Status: %s\tComment: %s'%(status,comment))
        if status == False:
            return False
        elif status == 'warn':
            pbs.logmsg(pbs.EVENT_DEBUG,'WARNING: %s'%comment)
            return True
        elif status == 'offline' or status == 'reboot':
            pbs.logmsg(pbs.EVENT_DEBUG,"Status: %s\tComment: %s"%(status,comment))
            # Get the node, offline it, 
            pbs.logmsg(pbs.EVENT_DEBUG,"Offline node: %s"%(self.host))
            myvnode = pbs.event().vnode_list[self.host]
            myvnode.state = pbs.ND_OFFLINE
            pbs.logmsg(pbs.EVENT_DEBUG,"Offline node type: %s, comment: %s"%(type(str(comment)),comment))
            myvnode.comment =  "-attn_nhc: "+comment
            #pbs.logmsg(pbs.EVENT_DEBUG,"restart scheduler: %s %s"%(self.host,repr(myvnode.state)))
            #pbs.server().scheduler_restart_cycle()

            # Check to see if the node should be rebooted
            if status == 'reboot':
                pbs.logmsg(pbs.EVENT_DEBUG,"Comment: %s\nOfflined node: %s and rebooted"%(comment,self.host))
                pbs.event().job.rerun()
                pbs.reboot('reboot')

                # Run this command if the node is rebooted
                # The event().reject function ends the script
                pbs.logmsg(pbs.EVENT_DEBUG,"Comment: %s\nOfflined node: %s and restarted scheduling cycle"%(comment,self.host))
                pbs.event().reject("Offlined node, sent the reboot signal, and restarted scheduling cycle")

            # Reject the job
            pbs.event().reject("Offlined node and restarted scheduling cycle")
                    
        elif status == 'online':
            pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(self.host))
            mynodename = pbs.get_local_nodename()
            myvnode = pbs.event().vnode_list[mynodename]
            pbs.logmsg(pbs.EVENT_DEBUG3,"got node: %s"%(mynodename))
            myvnode.state = pbs.ND_FREE
            pbs.logmsg(pbs.EVENT_DEBUG,"Changed node state to ND_FREE: %s"%(mynodename))
            myvnode.comment =  None
            pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(mynodename))
            
        else:
            return True
Example #23
def GetReqGpus():
    sel = repr(pbs.event().job.schedselect)
    req_gpus = 0
    for chunk in sel.split('+'):
        for c in chunk.split(":"):
            kv = c.split("=")
            if (kv[0] == "ngpus"):
                req_gpus = kv[1]
                pbs.logmsg(pbs.LOG_DEBUG,
                           "Selected ngpus = value=%s" % (str(kv[1])))
    return req_gpus
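
# For illustration (hypothetical schedselect string):
#   "2:ncpus=4:ngpus=2+1:ncpus=1" -> GetReqGpus() returns "2" (a string; callers use int(req_gpus))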
def pbs_expandrange(seq):
    # Deal with the ranges first
    range = list(seq.partition('-'))

    lower = int(range[0])
    upper = int(range[2])

    if (upper <= lower ):
        pbs.logmsg(pbs.LOG_ERROR, "translate mpp: ERROR: bad range '%s', " \
                  "the first number (%d) must be less than the " \
                  "second number (%d)" %(seq, lower, upper))
        errstr="Bad range '%s', the first number (%d) must be less than " \
               "second number (%d)" %(seq, lower, upper)
        pbs.event().reject("The following error was encountered: \n"+errstr)
    expandedlist=list()
    while (lower <= upper):
        expandedlist.append(str(lower))
        lower=lower+1

    return expandedlist 
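
# For illustration (hypothetical input):
#   pbs_expandrange("3-7") -> ['3', '4', '5', '6', '7']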
Example #25
    def stderr(self, msg):
        """Write msg to appropriate file handle for stdout"""
        import sys

        try:
            if not pbs.event().job.interactive and pbs.event().job.in_ms_mom():
                logfile = open(self.stderr_log, 'ab+')
            else:
                logfile = sys.stderr

            if DEBUG:
                pbs.logmsg(
                    pbs.EVENT_DEBUG3, '%s;%s;[DEBUG3]: writing %s to %s' %
                    (pbs.event().hook_name, pbs.event().job.id, repr(msg),
                     logfile.name))

            logfile.write(msg)
            logfile.flush()
            logfile.close()
        except IOError:
            trace_hook()
Example #26
def fixup_mpiprocs_ompthreads( sel ):
	selstr = repr(pbs.event().job.Resource_List["select"])

	if "mpiprocs" not in sel and "ompthreads" not in sel:
		mpiprocs   = int(sel["ncpus"])
		ompthreads = 1
		pbs.event().job.Resource_List["select"] = pbs.select( selstr + ":mpiprocs=" + str(mpiprocs ) + ":ompthreads=" + str(ompthreads) )
	elif "mpiprocs" not in sel and "ompthreads" in sel:
		ompthreads = int(sel["ompthreads"])
		mpiprocs   = ( sel["ncpus"] / ompthreads )
		if mpiprocs < 1:
			mpiprocs = 1
		pbs.event().job.Resource_List["select"] = pbs.select( selstr + ":mpiprocs=" + str(mpiprocs ) )
		# Add mpiprocs = ncpus / ompthreads
	elif "mpiprocs" in sel and "ompthreads" not in sel:
		mpiprocs   = int( sel["mpiprocs"] )
		ompthreads = int( sel["ncpus"] ) / mpiprocs 
		if ompthreads < 1:
			ompthreads =1
		pbs.event().job.Resource_List["select"] = pbs.select( selstr + ":ompthreads=" + str(ompthreads ) )
	else:
		mpiprocs = int(sel["mpiprocs"])
		ompthreads = int(sel["ompthreads"])
		if (mpiprocs * ompthreads) != int(sel["ncpus"]):
			pbs.event().reject( "mpiprocs * ompthreads must equal ncpus" )
Example #27
    def __init__(self, pbs_event):
        self.hook_events = {
            pbs.EXECHOST_STARTUP: self.__setallresources_handler,
            pbs.EXECHOST_PERIODIC: self.__setallresources_handler,
        }
        self.e = pbs_event
        self.vnl = pbs.event().vnode_list
        self.local_node = pbs.get_local_nodename()

        if self.vnl is None or self.local_node is None:
            pbs.logmsg(pbs.EVENT_DEBUG,
                       "%s, failed to get local_node or vnl" % self.hook_name)
            self.e.accept()
Example #28
def execjob_launch():

    try:
        if pbs.event().env["PBS_TASKNUM"] != "1":
            pbs.logmsg(
                pbs.LOG_DEBUG,
                "Not the first task, so not setting CUDA_VISIBLE_DEVICES")
            return
    except:
        pbs.logmsg(pbs.LOG_DEBUG, "Exception in getting PBS_TASKNUM from env")

    job = pbs.event().job.id
    vn = pbs.event().vnode_list
    req_gpus = GetReqGpus()

    if req_gpus:
        cuda_visible_devices = ""
        available_gpus = [
            i for i in range(0, numGpusOnHost())
        ]  # pbs.server().vnode(local_node).resources_available['ngpus']) ]
        # Check the GPUs already assigned on the node
        used_gpus = GetUsedGpus(GpuJobsPath)
        pbs.logmsg(pbs.LOG_DEBUG, "Used GPUs = %s" % (used_gpus))
        if used_gpus:
            available_gpus = [
                item for item in available_gpus if item not in used_gpus
            ]
            pbs.logmsg(pbs.LOG_DEBUG,
                       "GPUs available for assignment %s" % (available_gpus))
        for i in range(int(req_gpus)):
            if cuda_visible_devices != "":
                cuda_visible_devices += "\\,"
            cuda_visible_devices += str(available_gpus.pop(0))
        value = cuda_visible_devices
        pbs.logmsg(pbs.LOG_DEBUG,
                   "CUDA_VISIBLE_DEVICES ==> %s" % (value))
        pbs.event().env['CUDA_VISIBLE_DEVICES'] = str(value)
        UpdateGpuJobs(job, value, GpuJobsPath, add=True)
Example #29
def exechost_periodic():
    vn = pbs.event().vnode_list
    JobsOnNode = getJobs()
    JobsOnFile = GetJobsInFile(GpuJobsPath)
    pbs.logmsg(pbs.LOG_DEBUG, "Jobs in File %s" % (JobsOnFile))
    for job in JobsOnFile:
        if str(job) not in JobsOnNode:
            # Remove this job
            value = ""
            UpdateGpuJobs(job, value, GpuJobsPath, add=False)
            pbs.logmsg(
                pbs.LOG_DEBUG,
                "Removing Job %s from file %s --> The job is not running on the node"
                % (job, GpuJobsPath))
Example #30
    def stderr(self, msg):
        """Write msg to appropriate file handle for stdout"""
        import sys

        try:
            if not pbs.event().job.interactive and pbs.event().job.in_ms_mom():
                logfile=open(self.stderr_log, 'ab+')
            else:
                logfile=sys.stderr

            if DEBUG: 
                pbs.logmsg(pbs.EVENT_DEBUG3, 
                    '%s;%s;[DEBUG3]: writing %s to %s' %
                        (pbs.event().hook_name, 
                         pbs.event().job.id, 
                         repr(msg), 
                         logfile.name))

            logfile.write(msg)
            logfile.flush()
            logfile.close()
        except IOError:
            trace_hook()
Example #31
def check_express_project_code():
	project = pbs.event().job.project
	if not project:
		pbs.event().reject( "You must specify an express code with -P when submitting express jobs" )

	project = repr(project)
	if not re.match("^exp-[a-z0-9]+$", project ):
		pbs.event().reject( "Invalid express code: these have the format 'exp-XXXX'" )
	if not test_group_membership( [ project ] ):
		pbs.event().reject( "You are not authorised to use this express code" )

	try:
		import requests
		r = requests.get( "https://api.rcs.imperial.ac.uk/v1.0/express/%s/enabled" % ( project, ) )
		if (r.status_code == 200) and (r.text != "1"):
			pbs.event().reject("This express code is not enabled. Please contact [email protected]" )
	except:
#		pbs.event().reject("Exception checking express enabled")
		pass

	return project
Example #32
def extract_selection():
    if ("select" not in pbs.event().job.Resource_List) or (
            pbs.event().job.Resource_List["select"] == None):
        pbs.event().reject(
            "You must specify a resource selection using the format\n      -lselect=N:ncpus=X:mem=Ygb"
        )

    select = repr(pbs.event().job.Resource_List["select"])
    select = select.split("+")

    if len(select) > 1:
        pbs.event().reject("Only one -lselect is permitted.")

    chunk = select[0]

    ret = dict()
    try:
        nodect = 0
        chunk = chunk.split(":")
        nodect = int(chunk[0])

        ret["nodect"] = nodect

        for rs in chunk[1:]:
            key = rs.split("=")[0]
            val = rs.split("=")[1]
            if key not in list_of_resources:
                pbs.event().reject("Resource [" + key +
                                   "] not permitted in -lselect.")
            # Try converting the value to an integer if it happens to be one
            try:
                val = int(val)
            except:
                pass
            ret[key] = val
    except:
        pass
    return ret
Example #33
def retry_post(data):
    """
    In the case where a POST fails due to a 400 error,
    it could be because there is already a job on the cray side.
    In that case, we should try to delete the existing job and
    resubmit a new one.

    If a previous POST timedout so we rejected it, but the service
    just took too long to respond, it would exist on the service.
    """
    event = pbs.event()
    jid = event.job.id

    joburl = HookHelper.build_path(resource='job', jobid=jid)
    del_timeout = HookHelper.get_config()['delete_timeout']
    try:
        r_del = delete(joburl, timeout=del_timeout)
        r_del.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'DELETE timed out')
        raise OfflineError('Job delete timed out')
    except requests.HTTPError:
        # If 404, then maybe the job that was there is now gone,
        # try posting again. Otherwise, raise an OfflineError
        if r_del.status_code != 404:
            log_with_caller(pbs.EVENT_ERROR, 'DELETE job failed')
            raise OfflineError('Job delete failed')

    url = HookHelper.build_path(resource='job')
    post_timeout = HookHelper.get_config()['post_timeout']
    try:
        r_post = post(url, json=data, timeout=post_timeout)
        r_post.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'POST timed out')
        raise OfflineError('Job POST timed out')
    except requests.HTTPError:
        log_with_caller(pbs.EVENT_ERROR,
                        'Invalid status code %d' % r_post.status_code)
        raise OfflineError('Job POST encountered invalid status code')

    # if we got here, we've successfully deleted and re-posted the job
    log_with_caller(pbs.EVENT_DEBUG, 'Job %s registered' % jid)
    return
Example #34
    def _get_usage(self, job):
        pbs.logmsg(pbs.EVENT_DEBUG3, "Cray: %s get_usage" % (job.id))
        try:
            f = open(energy_file(job), "r")
            start = int(f.read())
            f.close()
        except Exception:
            return None

        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook.  Here we fill in some
            # global variables just once and use the information
            # for each job in turn.  Save the result of calling capmc
            # for all running jobs in the variable ninfo.  Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:		# owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" %
                              energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is not None:
            return float(energy - start) / 3600000.0
        else:
            return None
Example #35
    def _get_usage(self, job):
        pbs.logjobmsg(job.id, "Cray: get_usage")
        try:
            f = open(energy_file(job), "r")
            start = int(f.read())
            f.close()
        except Exception:
            return None

        e = pbs.event()
        if e.type == pbs.EXECHOST_PERIODIC:
            # This function will be called for each job in turn when
            # running from a periodic hook.  Here we fill in some
            # global variables just once and use the information
            # for each job in turn.  Save the result of calling capmc
            # for all running jobs in the variable ninfo.  Keep a
            # dictionary with the job id's as keys holding a set
            # of nid numbers.
            if Pmi.ninfo is None:
                allnids = set()
                for jobid in e.job_list.keys():
                    j = e.job_list[jobid]
                    nidset = jobnids(j)
                    allnids.update(nidset)
                    Pmi.nidarray[jobid] = nidset
                nids, cnt = nidlist(None, allnids)
                Pmi.ninfo = node_energy("all", nids, cnt)
            nidset = Pmi.nidarray[job.id]
            energy = None
            if Pmi.ninfo is not None and "nodes" in Pmi.ninfo:
                energy = 0
                for node in Pmi.ninfo["nodes"]:
                    if node["nid"] in nidset:		# owned by job of interest
                        energy += node["energy_ctr"]
                pbs.logjobmsg(job.id, "Cray: get_usage: energy %dJ" %
                              energy)
        else:
            nids, cnt = nidlist(job)
            energy = job_energy(job, nids, cnt)
        if energy is not None:
            return float(energy - start) / 3600000.0
        else:
            return None
Example #36
def handle_execjob_end():
    """
    Handler for execjob_end events.
    """
    log_function_name()
    jid = pbs.event().job.id
    url = HookHelper.build_path(resource='job', jobid=jid)
    timeout = HookHelper.get_config()['delete_timeout']
    try:
        r = delete(url, timeout=timeout)
        r.raise_for_status()
    except requests.Timeout:
        log_with_caller(pbs.EVENT_ERROR, 'DELETE timed out')
        raise RejectError('Job delete timed out')
    except requests.HTTPError:
        log_with_caller(pbs.EVENT_ERROR, 'DELETE job failed')
        raise RejectError('Job delete failed')

    log_with_caller(pbs.EVENT_DEBUG, 'Job %s deleted' % jid)
Example #37
def jobobit_hook():
    import pbs
    import sys

    try:
        e = pbs.event()
        job = e.job
        pbs.logjobmsg(job.id,
                      'jobobit hook started for test %s' % (e.hook_name, ))
        pbs.logjobmsg(job.id, 'jobobit hook, job starttime:%s' % (job.stime, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job obittime:%s' % (job.obittime, ))
        pbs.logjobmsg(job.id, 'jobobit hook, job_state=%s' % (job.job_state, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_substate=%s' % (job.substate, ))
        state_desc = pbs.REVERSE_JOB_STATE.get(job.job_state, '(None)')
        substate_desc = pbs.REVERSE_JOB_SUBSTATE.get(job.substate, '(None)')
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_state_desc=%s' % (state_desc, ))
        pbs.logjobmsg(job.id,
                      'jobobit hook, job_substate_desc=%s' % (substate_desc, ))
        if hasattr(job, "resv") and job.resv:
            pbs.logjobmsg(job.id,
                          'jobobit hook, resv:%s' % (job.resv.resvid, ))
            pbs.logjobmsg(
                job.id,
                'jobobit hook, resv_nodes:%s' % (job.resv.resv_nodes, ))
            pbs.logjobmsg(
                job.id,
                'jobobit hook, resv_state:%s' % (job.resv.reserve_state, ))
        else:
            pbs.logjobmsg(job.id, 'jobobit hook, resv:(None)')
        pbs.logjobmsg(job.id,
                      'jobobit hook finished for test %s' % (e.hook_name, ))
    except Exception as err:
        ty, _, tb = sys.exc_info()
        pbs.logmsg(
            pbs.LOG_DEBUG,
            str(ty) + str(tb.tb_frame.f_code.co_filename) + str(tb.tb_lineno))
        e.reject()
    else:
        e.accept()
Example #38
def extract_queue_type():

    if pbs.event().job.queue == "" or pbs.event().job.queue == None:
        return "common"

    if (pbs.event().job.queue != ""):
        queue_name = pbs.event().job.queue.name

    if queue_name == "express":  # len(queue_name)>0 and queue_name[0] == "e":
        return "express"
    elif len(queue_name) > 0 and (queue_name[0] == "p" or queue_name
                                  == "med-bio" or queue_name == "viz"):
        return "private"
    elif queue_name.startswith(queue_config_version):
        queue_name = re.sub("^" + queue_config_version, "", queue_name)
        return "common:" + queue_name
    elif queue_name == "gpgpu":
        pbs.event().reject(
            "-q gpgpu no longer required. Please submit without a queue qualification"
        )
    else:
        pbs.event().reject("Unknown queue name.")
Example #39
def parse_config_file():
    # Turn everything off by default. These settings will be modified
    # when the configuration file is read.
    global pbs_home
    global pbs_exec
    global power_ramp_rate_enable
    global power_on_off_enable
    global node_idle_limit
    global min_node_down_delay
    global max_jobs_analyze_limit
    global max_concurrent_nodes

    try:
        # This block will work for PBS Pro versions 13 and later
        pbs_conf = pbs.get_pbs_conf()
        pbs_home = pbs_conf['PBS_HOME']
        pbs_exec = pbs_conf['PBS_EXEC']
    except:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "PBS_HOME needs to be defined in the config file")
        pbs.logmsg(pbs.EVENT_DEBUG, "Exiting the power hook")
        pbs.event().accept()

    # Identify the config file and read in the data
    config_file = ''
    if 'PBS_HOOK_CONFIG_FILE' in os.environ:
        config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
    tmpcfg = ''
    if not config_file:
        tmpcfg = os.path.join(pbs_home, 'server_priv', 'hooks',
                              'PBS_power.CF')
    if os.path.isfile(tmpcfg):
        config_file = tmpcfg
    if not config_file:
        tmpcfg = os.path.join(pbs_home, 'mom_priv', 'hooks',
                              'PBS_power.CF')
    if os.path.isfile(tmpcfg):
        config_file = tmpcfg
    if not config_file:
        raise Exception("Config file not found")
    pbs.logmsg(pbs.EVENT_DEBUG3, "Config file is %s" % config_file)
    try:
        fd = open(config_file, 'r')
        config = json.load(fd)
        fd.close()
    except IOError:
        raise Exception("I/O error reading config file")
    except:
        raise Exception("Error reading config file")

    # Assign default values to attributes
    power_ramp_rate_enable = False
    power_on_off_enable = False
    node_idle_limit = 1800
    min_node_down_delay = 1800
    max_jobs_analyze_limit = 100
    max_concurrent_nodes = 10

    # Now assign values read from the config file
    if 'power_on_off_enable' in config:
        power_on_off_enable = config['power_on_off_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3, "power_on_off_enable is set to %s" %
                   str(power_on_off_enable))
    if 'power_ramp_rate_enable' in config:
        power_ramp_rate_enable = config['power_ramp_rate_enable']
        pbs.logmsg(pbs.EVENT_DEBUG3, "power_ramp_rate_enable is set to %s" %
                   str(power_ramp_rate_enable))
    if 'node_idle_limit' in config:
        node_idle_limit = int(config['node_idle_limit'])
        if not node_idle_limit or node_idle_limit < 0:
            node_idle_limit = 1800
        pbs.logmsg(pbs.EVENT_DEBUG3, "node_idle_limit is set to %d" %
                   node_idle_limit)
    if 'min_node_down_delay' in config:
        min_node_down_delay = int(config['min_node_down_delay'])
        if not min_node_down_delay or min_node_down_delay < 0:
            min_node_down_delay = 1800
        pbs.logmsg(pbs.EVENT_DEBUG3, "min_node_down_delay is set to %d" %
                   min_node_down_delay)
    if 'max_jobs_analyze_limit' in config:
        max_jobs_analyze_limit = int(config['max_jobs_analyze_limit'])
        if not max_jobs_analyze_limit or max_jobs_analyze_limit < 0:
            max_jobs_analyze_limit = 100
        pbs.logmsg(pbs.EVENT_DEBUG3, "max_jobs_analyze_limit is set to %d" %
                   max_jobs_analyze_limit)
    if 'max_concurrent_nodes' in config:
        max_concurrent_nodes = int(config['max_concurrent_nodes'])
        if not max_concurrent_nodes or max_concurrent_nodes < 0:
            max_concurrent_nodes = 10
        pbs.logmsg(pbs.EVENT_DEBUG3, "max_concurrent_nodes is set to %d" %
                   max_concurrent_nodes)
Example #40
    if 'max_jobs_analyze_limit' in config:
        max_jobs_analyze_limit = int(config['max_jobs_analyze_limit'])
        if not max_jobs_analyze_limit or max_jobs_analyze_limit < 0:
            max_jobs_analyze_limit = 100
        pbs.logmsg(pbs.EVENT_DEBUG3, "max_jobs_analyze_limit is set to %d" %
                   max_jobs_analyze_limit)
    if 'max_concurrent_nodes' in config:
        max_concurrent_nodes = int(config['max_concurrent_nodes'])
        if not max_concurrent_nodes or max_concurrent_nodes < 0:
            max_concurrent_nodes = 10
        pbs.logmsg(pbs.EVENT_DEBUG3, "max_concurrent_nodes is set to %d" %
                   max_concurrent_nodes)


# Accept if event not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC,
                           pbs.PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()


if this_event.type == pbs.PERIODIC:
    vnlist = this_event.vnode_list
    resvlist = this_event.resv_list
    time_now = time.time()

    # Parse the config file for power attributes
Example #41
def trace_hook(**kwargs):
    """Simple exception trace logger for PBS hooks
    loglevel=<int> (pbs.LOG_DEBUG): log level to pass to pbs.logmsg()
    reject=True: reject the job upon completion of logging trace
    trace_in_reject=<bool> (False): pass trace to pbs.event().reject()
    trace_in_reject=<str>: message to pass to pbs.event().reject() with trace
    """
    import sys
    
    if 'loglevel' in kwargs: 
        loglevel=kwargs['loglevel']
    else: 
        loglevel=pbs.LOG_ERROR
    if 'reject' in kwargs: 
        reject=kwargs['reject']
    else: 
        reject=True
    if 'trace_in_reject' in kwargs: 
        trace_in_reject=kwargs['trace_in_reject']
    else: 
        trace_in_reject=False

    # Associate hook events with the appropriate PBS constant. This is a list
    # of all hook events as of PBS Pro 13.0. If the event does not exist, it is
    # removed from the list.
    hook_events=['queuejob', 'modifyjob', 'movejob', 'runjob', 'execjob_begin',
                 'execjob_prologue', 'execjob_launch', 'execjob_attach', 
                 'execjob_preterm', 'execjob_epilogue', 'execjob_end', 
                 'resvsub', 'provision', 'exechost_periodic', 
                 'exechost_startup']

    hook_event={}
    for he in hook_events:
        # Only set available hooks for the current version of PBS.
        if hasattr(pbs, he.upper()):
            event_code=eval('pbs.'+he.upper())
            hook_event[event_code]=he
            hook_event[he]=event_code
            hook_event[he.upper()]=event_code
            del event_code
        else:
            del hook_events[hook_events.index(he)]

    trace={
        'line':      sys.exc_info()[2].tb_lineno,
        'module':    sys.exc_info()[2].tb_frame.f_code.co_name,
        'exception': sys.exc_info()[0].__name__,
        'message':   sys.exc_info()[1].message,
    }
    tracemsg='%s hook %s encountered an exception: Line %s in %s %s: %s' %(
        hook_event[pbs.event().type], pbs.event().hook_name,
        trace['line'], trace['module'], trace['exception'], trace['message']
    )
    rejectmsg="Hook Error: request rejected as filter hook '%s' encountered " \
        "an exception. Please inform Admin" % pbs.event().hook_name
    if not isinstance(loglevel, int):
        tracemsg='trace_hook() called with invalid argument (loglevel=%s), '\
            'setting to pbs.LOG_ERROR. ' % repr(loglevel) + tracemsg
        loglevel=pbs.LOG_ERROR

    pbs.logmsg(pbs.LOG_ERROR, tracemsg)

    if reject:
        tracemsg+=', request rejected'
        if isinstance(trace_in_reject, bool):
            if trace_in_reject: 
                pbs.event().reject(tracemsg)
            else: 
                pbs.event().reject(rejectmsg)
        else: 
            pbs.event().reject(str(trace_in_reject)+'Line %s in %s %s:\n%s' % (
                trace['line'],trace['module'],trace['exception'],
                trace['message'] ))
Example #42
# resides.
#
def get_filesystem_avail_unprivileged( dirname ):
    o = os.statvfs(dirname)
    return pbs.size( "%skb" % ((o.f_bsize * o.f_bavail) / 1024) )

# get_filesystem_avail_privileged: returns available size in kbytes
# (in pbs.size type) to privileged users, of the filesystem where 'dirname'
# resides.
#
def get_filesystem_avail_privileged( dirname ):
    o = os.statvfs(dirname)
    return pbs.size( "%skb" % ((o.f_bsize * o.f_bfree) / 1024) )


# Define here the custom resources as key, and the function and its argument
# for obtaining the value of the custom resource:
#    Format: dyn_res[<resource_name>] = [<function_name>, <function_argument>]
# So "<function_name>(<function_argument>)" is called to return the value
# for custom <resource_name>.
dyn_res = {}
dyn_res["scratch"] = [get_filesystem_avail_unprivileged, "/tmp"]
dyn_res["home"]    = [get_filesystem_avail_unprivileged, "/home"]

vnl = pbs.event().vnode_list
local_node = pbs.get_local_nodename()

for k in dyn_res.keys():
    vnl[local_node].resources_available[k] = dyn_res[k][0](dyn_res[k][1])

Example #43
 def get_usage(self, job=None):
     self._check_pmi()
     if job is None:
         job = pbs.event().job
     return self.__pmi._get_usage(job)
Example #44
File: test.py Project: Wukie/SNS
# (C) Alberto Coduti 2019

import pbs
import sys

try:
	if pbs.event().requestor == "a.coduti":
		print pbs.event().requestor
		pbs.event().accept()
	else:
		pbs.event().reject("Tu non sei Albè!")
except SystemExit:
	pass
except:
	pbs.event().reject("Exception")
Example #45
# For a copy of the commercial license terms and conditions,
# go to: (http://www.pbspro.com/UserArea/agreement.html)
# or contact the Altair Legal Department.
#
# Altair’s dual-license business model allows companies, individuals, and
# organizations to create proprietary derivative works of PBS Pro and
# distribute them - whether embedded or bundled with other software -
# under a commercial license agreement.
#
# Use of Altair’s trademarks, including but not limited to "PBS™",
# "PBS Professional®", and "PBS Pro™" and Altair’s logos is subject to Altair's
# trademark licensing policies.

import pbs

e = pbs.event()
if e.type == pbs.RESVSUB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resvsub")
elif e.type == pbs.RESV_END:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is resv_end")
elif e.type == pbs.QUEUEJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is queuejob")
elif e.type == pbs.MODIFYJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is modifyjob")
elif e.type == pbs.MOVEJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is movejob")
elif e.type == pbs.RUNJOB:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is runjob")
elif e.type == pbs.PERIODIC:
    pbs.logmsg(pbs.LOG_DEBUG, "hook event type is periodic")
elif e.type == pbs.EXECJOB_BEGIN:
Example #46
'''


# Define the version
__version__ = '0.0.2'


import sys
import os
import json

try:
    import pbs

    # Remember, periodic events do not have a job associated to them.
    if pbs.event().type != pbs.EXECHOST_PERIODIC:
        who = pbs.event().job.euser

# For limiting testing to 1 user's jobs, uncomment this and change username
#        pbs.logmsg(pbs.EVENT_DEBUG3,'User: %s'%who)
#        if who != 'jshelley':
#            pbs.logmsg(pbs.EVENT_DEBUG,'jshelley != %s'%who)
#            pbs.event().accept()

    pbs.logmsg(pbs.EVENT_DEBUG3,'Event: %s'%pbs.event().type)
    
    # Add the site-packages paths to the sys path
    pbs_conf = pbs.pbs_conf
#    py_path = '/opt/pbs/default/python/lib'
    py_path = pbs_conf['PBS_EXEC']+os.sep+'python/lib'
Example #47
    def ChkTouchFileAsUser(self):
        if self.nhc_cfg["as_user_operations"]["check"] == False: 
            pbs.logmsg(pbs.EVENT_DEBUG3,"Skipping touch file as user check")
            return True


        for file_dir in self.nhc_cfg["as_user_operations"]["touch_files"]:
            file_dir_orig=file_dir
            # Check to see if this is a periodic hook. If so skip pbsuser file touches
            if pbs.event().type == pbs.EXECHOST_PERIODIC and self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0] == 'pbsuser':
                pbs.logmsg(pbs.EVENT_DEBUG3,"Skipping this check dir: %s, since this is a periodic hook"%file_dir)
                continue

#            pbs.logmsg(pbs.EVENT_DEBUG3,"Dir: %s\tUser: %s"%(file_dir,str(self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0])))
#            pbs.logmsg(pbs.EVENT_DEBUG3,"Job User: %s"%(self.user))

            try:
                new_file_dir = ''
                if file_dir.startswith('$'):
                    # I need to flesh out how to best handle this.
                    # It will require looking through the job environment variables
                    V = pbs.event().job.Variable_List
                    pbs.logmsg(pbs.EVENT_DEBUG3,"Type(V): %s"%(type(V)))
                    pbs.logmsg(pbs.EVENT_DEBUG3,"Job variable list: %s"%(V))
                    for var in V:
                        pbs.logmsg(pbs.EVENT_DEBUG3,"var: %s, file_dir: %s"%(var,file_dir))
                        pbs.logmsg(pbs.EVENT_DEBUG3,"V[var]: %s"%(V[var]))
                        if var.startswith(file_dir[1:]):
                            new_file_dir = V[var]
                            pbs.logmsg(pbs.EVENT_DEBUG3,"New dir: %s"%(file_dir))
                            break
                            
                    pass

                # Check to see what user this test should be run as.
                # Options: pbsuser or pbsadmin
                status = ''
                if self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0] == 'pbsadmin':
                    pbs.logmsg(pbs.EVENT_DEBUG3,"TouchFileAsAdmin: %s"%(file_dir))
                    if new_file_dir != '':
                        status = self.TouchFileAsUser('root',new_file_dir,file_dir_orig)
                    else:
                        status = self.TouchFileAsUser('root',file_dir,file_dir_orig)
                        
                elif self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0] == 'pbsuser':
                    # Check to see if check is to be written to a specific user dir
                    pbs.logmsg(pbs.EVENT_DEBUG3,"TouchFileAsUser: User: %s, Dir: %s"%(self.user,file_dir))
                    if file_dir.find('<userid>') != -1:
                        file_dir = file_dir.replace('<userid>',self.user)
                    
                    # Try to touch the file
                    if new_file_dir != '':
                        status = self.TouchFileAsUser(self.user,new_file_dir,file_dir_orig)
                    else:
                        status = self.TouchFileAsUser(self.user,file_dir,file_dir_orig)
                else:
                    pbs.logmsg(pbs.EVENT_DEBUG,"Unknown User: %s. Please specify either pbsadmin or pbsuser"% \
                                         (str(self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0])))
                    return [self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][1],"Unknown User: %s. Please specify either pbsadmin or pbsuser"% \
                                         (str(self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][0]))]

                if status != True:
                    return status
                    
            except OSError:
                return [self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][1],'Can not find file/dir: %s'%file_dir]
            except Exception, e:
                return [self.nhc_cfg["as_user_operations"]["touch_files"][file_dir_orig][1],'Encountered an error %s for file/dir: %s'%(e,file_dir)]
Example #48
	conn.close()
	if result is not None:
		pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project %s has %s hours" % (project, result[0]))
		return float(result[0])
	else:
		pbs.logmsg(pbs.LOG_DEBUG, "---> getBudget: project not found")
		return None

# Setting some messages
contact_msg = ", please contact Admins at [email protected] and report this message."

# Let's start!

try:
	# If it's a system user accept the job
	if pbs.event().requestor in ["PBS_Server", "Scheduler", "pbs_mom"]:
		pbs.event().accept()

	# Check if project has been set
	if pbs.event().job.project is not None:
		project = str(pbs.event().job.project)
	else:
		project = "_pbs_project_default"

	# Accept if it's default project
	# TODO change it in production
	if project == "_pbs_project_default":
		pbs.event().accept()

	pbs.logmsg(pbs.LOG_DEBUG, "---> Queuejob Hook Start! Requestor is %s and project is %s" % (pbs.event().requestor, project))
	pbs.logmsg(pbs.LOG_DEBUG, "---> select line is %s" % pbs.event().job.Resource_List.select)
Example #49
def setBudget(project, budget):
	pbs.logmsg(pbs.LOG_DEBUG, "---> setBudget: " + str(project) + " set to " + str(budget))
	conn = psycopg2.connect(database="pbs_accounting", user = "******", password = "******", host = "mullis01.sns.it", port = "5432")
	cur = conn.cursor()
	cur.execute("UPDATE projects SET project_hours = %s WHERE project_name = %s;",(budget, project))
	conn.commit()
	cur.close()
	conn.close()

# Setting some messages
contact_msg = ", please contact Admins at [email protected] and report this message."

# Let's start!

try:
	project = str(pbs.event().job.project)
	myQueue = str(pbs.event().job.queue)
	
	# Accept if it's default project
	if project == "_pbs_project_default":
		pbs.event().accept()

	pbs.logmsg(pbs.LOG_DEBUG, "---> Runjob Hook Start! User is %s and project is %s" % (pbs.event().job.euser, project))
	pbs.logmsg(pbs.LOG_DEBUG, "---> select line is %s" % pbs.event().job.Resource_List.select)

	#
	ncpus = pbs.event().job.Resource_List.ncpus
	if ncpus is None:
		pbs.logmsg(pbs.LOG_DEBUG, "---> ncpus is None, parsing select line")
		select = str(pbs.event().job.Resource_List.select)
		if "ncpus=" in select:
Example #50
# The following constants can be modified in run_pelog_shell.ini to match 
# site preferences.

ENABLE_PARALLEL=False
VERBOSE_USER_OUTPUT=False
DEFAULT_ACTION=RERUN
TORQUE_COMPAT=False

import pbs
import os, sys
import time

# Set up a few variables
start_time=time.time()
pbs_event=pbs.event()
hook_name=pbs_event.hook_name
hook_alarm=30 # default, we'll read it from the .HK later
DEBUG=False # default, we'll read it from the .HK later
job=pbs_event.job

# The trace_hook function has been written to be portable between hooks. 
def trace_hook(**kwargs):
    """Simple exception trace logger for PBS hooks
    loglevel=<int> (pbs.LOG_DEBUG): log level to pass to pbs.logmsg()
    reject=True: reject the job upon completion of logging trace
    trace_in_reject=<bool> (False): pass trace to pbs.event().reject()
    trace_in_reject=<str>: message to pass to pbs.event().reject() with trace
    """
    import sys
    
Example #51
        event.reject(str(e))
    return power


def vnodes_enabled(job):
    # see if power operations are allowed on all job vnodes
    for vn in _get_vnode_names(job):
        if not _svr_vnode(vn).power_provisioning:
            pbs.logjobmsg(job.id,
                          "power functionality is disabled on vnode %s" % vn)
            return False
    return True


# Accept if event not serviceable.
this_event = pbs.event()
if this_event.type not in [pbs.EXECJOB_PROLOGUE, pbs.EXECJOB_EPILOGUE,
                           pbs.EXECJOB_BEGIN, pbs.EXECJOB_END,
                           pbs.EXECHOST_STARTUP, pbs.EXECHOST_PERIODIC]:
    pbs.logmsg(pbs.LOG_WARNING,
               "Event not serviceable for power provisioning.")
    this_event.accept()


# Set eoe values for my node
if this_event.type == pbs.EXECHOST_STARTUP:
    from pbs.v1._pmi_utils import _is_node_provisionable

    # Don't connect if the server or sched is running.
    if not _is_node_provisionable():
        pbs.logmsg(pbs.LOG_DEBUG,
Example #52
eventsDict = {
1: "pbs.QUEUEJOB",
2: "pbs.MODIFYJOB",
4: "pbs.RESVSUB",
8: "pbs.MOVEJOB",
16: "pbs.RUNJOB",
32: "pbs.PROVISION",
64: "pbs.EXECJOB_BEGIN",
128: "pbs.EXECJOB_PROLOGUE",
256: "pbs.EXECJOB_EPILOGUE",
512: "pbs.EXECJOB_END",
1024: "pbs.EXECJOB_PRETERM",
4096: "pbs.EXECHOST_PERIODIC"
}

try:
	event_type = pbs.event().type
	user = str(pbs.event().requestor)
	jobID = pbs.event().job.id
	if event_type == 1:
		pbs.logmsg(pbs.LOG_DEBUG, "---> Hook called! Event pbs.QUEUEJOB by %s" % user)
	else:
		pbs.logmsg(pbs.LOG_DEBUG, "---> Hook called! Event %s by %s for job %s" % (eventsDict[event_type], user, jobID))
	pbs.event().accept()

except SystemExit:
	pass

except Exception, e:
	pbs.event().reject('Something went wrong, just got an Exception: %s' % str(e))
Example #53
            mynode = pbs.event().vnode_list[pbs.get_local_nodename()]
            if mynode.power_provisioning:
                try:
                    profiles = self.__pmi._query(
                        pbs.Power.QUERY_PROFILE)
                    names = self._map_profile_names(profiles)
                    mynode.resources_available["eoe"] = names
                    pbs.logmsg(pbs.LOG_WARNING,
                               "PMI:activate: set eoe: %s" % names)
                except:
                    pass
            raise BackendError(e)
        except InternalError, e:
            # couldn't do activation so set vnode offline
            me = pbs.get_local_nodename()
            pbs.event().vnode_list[me].state += pbs.ND_OFFLINE
            pbs.logmsg(pbs.LOG_WARNING, "PMI:activate: set vnode offline")
            raise InternalError(e)

    def deactivate_profile(self, job=None):
        self._check_pmi()

        if job is None:
            job = pbs.event().job
        if _running_excl(job):
            pbs.logjobmsg(job.id, "PMI: reset current_eoe")
            for h in _get_vnode_names(job):
                try:
                    pbs.event().vnode_list[h].current_eoe = None
                except:
                    pass
Example #54
#accounting_file = '/cm/shared/apps/pbspro/var/spool/avogadro_accounting.json'
accounting_file = 'projects.acct'

# Loading accounts dictionary from file
if os.path.isfile(accounting_file):
	accounts_dictionary = {}
	accountFile = open(accounting_file, 'r')
	for line in accountFile:
		temp = split(line, " = ")
		key = str(temp[0])
		value = float(temp[1])
		accounts_dictionary.update({key: value})
	accountFile.close()
else:
	pbs.event().reject("No accounts database found")

# Let's start!

try:
	# If it's a system user accept the job
	if pbs.event().requestor in ["PBS_Server", "Scheduler", "pbs_mom"]:
		pbs.event().accept()

	# I'm setting these variables for readability
	if pbs.event().job.project is not None:
		project = str(pbs.event().job.project)
		project_budget = accounts_dictionary[project]
	else:
		pbs.event().reject("No project set (project=None), please contact Admins at [email protected] and report this message.")
Example #55
# (C) Alberto Coduti 2019

import pbs, sys
import os
from string import split

#accounting_file = '/cm/shared/apps/pbspro/var/spool/avogadro_accounting.json'
accounting_file = 'test_avogadro_accounts'
hook_debug_file = str(pbs.event().job.Variable_List["PBS_O_WORKDIR"]) + '/hook_debug.log'

def durationToHours(duration):
	hours,minutes,seconds = str(duration).split(":")
	temp = float(hours) + (float(minutes)/60) + (float(seconds)/3600)
	return temp

def hoursToDuration(hours):
	# Inverse of durationToHours: build an HH:MM:00 walltime string
	whole_hours = int(hours)
	minutes = int(round((hours - whole_hours) * 60))
	return "%02d:%02d:00" % (whole_hours, minutes)

# Loading accounts dictionary from file
if os.path.isfile(accounting_file):
	accounts_dictionary = {}
	accountFile = open(accounting_file, 'r')
	for line in accountFile:
		temp = line.split(" = ")
		key = str(temp[0])
		value = float(temp[1])
		accounts_dictionary.update({key: value})
	accountFile.close()
else:
	pbs.event().reject("No accounts database found")
Example #56
# test_end_postgres.py
# by Alberto Coduti - Feb 2019

import pbs, sys
sys.path.append('/cm/shared/apps/pbspro/default/python/lib/python2.5/site-packages')
import psycopg2

try:
	pbs.logmsg(pbs.LOG_DEBUG, "---> EXECJOB_END Hook called")
	pbs.logmsg(pbs.LOG_DEBUG, "---> pbs.event().job.euser = " + str(pbs.event().job.euser))
	pbs.event().accept()

except SystemExit:
	pass

except Exception, e:
	pbs.event().reject('Something went wrong, just got an Exception: ' + str(e) + contact_msg)
Example #57
	conn.close()

def walltimeToHours(walltime):
	hours, minutes, seconds = str(walltime).split(":")
	converted = float(hours) + (float(minutes)/60) + (float(seconds)/3600)
	pbs.logmsg(pbs.LOG_DEBUG, "---> Walltime %s converted to %s" % (walltime, converted))
	return converted

# Setting some messages
contact_msg = ", please contact Admins at [email protected] and report this message."

# Let's start!

try:
	# Check if project has been set
	if pbs.event().job.project is not None:
		project = str(pbs.event().job.project)
	else:
		project = "_pbs_project_default"

	# Accept if it's default project
	# TODO change it in production
	if project == "_pbs_project_default":
		pbs.event().accept()

	#
	myQueue = str(pbs.event().job.queue)
	if myQueue == "":
		pbs.event().reject("No queue selected, please select a queue")
	if myQueue == "workq":
		pbs.event().reject("Queue workq is not enabled")
Example #58
# test_preterm.py
# by Alberto Coduti - Feb 2019

import pbs

try:
	pbs.logmsg(pbs.LOG_DEBUG, "---> EXECJOB_PRETERM Hook called")
	pbs.event().accept()

except SystemExit:
	pass

except Exception, e:
	pbs.event().reject('Something went wrong, just got an Exception: ' + str(e) + contact_msg)
Example #59
# (C) Alberto Coduti 2019

import pbs, sys
import os
from string import split

#accounting_file = '/cm/shared/apps/pbspro/var/spool/avogadro_accounting.json'
accounting_file = 'test_avogadro_accounts'
hook_debug_file = str(pbs.event().job.Variable_List["PBS_O_WORKDIR"]) + '/hook_debug.log'

# Loading accounts dictionary from file
if os.path.isfile(accounting_file):
	accounts_dictionary = {}
	accountFile = open(accounting_file, 'r')
	for line in accountFile:
		temp = split(line, " = ")
		key = str(temp[0])
		value = float(temp[1])
		accounts_dictionary.update({key: value})
	accountFile.close()
else:
	pbs.event().reject("No accounts database found")

# Let's start!

try:
	# I'm setting these variables for readability
	user = str(pbs.event().requestor)
	user_budget = int(accounts_dictionary[user])

	# If system user accept the job