import sys
from subprocess import check_output
from time import ctime, mktime, strptime, time

from PBSQuery import PBSQuery


class GlobalVars:
    def __init__(self):
        self.pbs_query = PBSQuery()
        self.job_type = ""
        self.idle_jobs = []
        self.total_jobs = 0.0
        self.job_list_length = 0.0
        self.current_time = mktime(strptime(ctime(time())))  # in seconds
        self.current_year = ctime(time()).split(" ")[4]
        self.system = self.pbs_query.get_server_name().partition("-")[0]
        self.job_server = self.pbs_query.get_server_name()
        self.word = sys.maxsize
        # double check word/gb conversions ...
        self.gb_from_mw = 1 / (64.0 ** 3)
        self.gb_from_kw = 1 / (64.0 ** 2)
        self.gb_from_w = 1 / 64.0
        self.gb_from_mb = 1 / 1024.0
        self.gb_from_kb = self.gb_from_mb / 1024.0
        self.gb_from_b = self.gb_from_kb / 1024.0
        self.available_licenses = self.find_available_licenses()
        self.showres = check_output(["showres"]).decode().strip("\n").split("\n")

    def find_available_licenses(self):
        """Find all available licenses on the system.

        Returns a dict mapping each license name to its available count.
        """
        output = check_output(["/usr/local/sbin/query-flexlm"]).decode()
        licenses = {}
        for lic in output.rstrip("\n").partition("ARES=")[2].split(","):
            name, _, count = lic.partition(":")
            licenses[name] = int(count)
        return licenses
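# A self-contained sanity check of the license parsing above, run against a
# hypothetical query-flexlm output line (the sample string and license names
# are made up; the real output format may differ):
sample = "FEATURES ARES=abaqus:4,matlab:12"
parsed = {lic.partition(":")[0]: int(lic.partition(":")[2])
          for lic in sample.rstrip("\n").partition("ARES=")[2].split(",")}
assert parsed == {"abaqus": 4, "matlab": 12}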
def is_running(self, queue_id):
    """Return whether the job identified by 'queue_id' is still in the queue.

    Input:
        queue_id: Unique identifier for a job.

    Output:
        in_queue: Boolean value. True if the job identified by
            'queue_id' is still in the queue.
    """
    batch = PBSQuery.PBSQuery().getjobs()
    return queue_id in batch
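# Hypothetical usage sketch (the 'mgr' instance and job id are assumed, not
# from the original source): poll until the job leaves the queue.
#
#     mgr = PBSManager()  # an instance of the class defining is_running()
#     while mgr.is_running("12345.pbs-server"):
#         time.sleep(60)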
import PBSQuery


def get_nodes(jobs):
    """Return the unique hostnames on which the given jobs are executing."""
    p = PBSQuery.PBSQuery()
    nodes = list()
    for job in jobs:
        exec_hosts = p.getjob(job, ['exec_host'])
        if not exec_hosts or 'exec_host' not in exec_hosts:
            continue
        # exec_host entries look like "node1/0+node1/1+node2/0";
        # keep each hostname once.
        for exec_host in exec_hosts['exec_host'][0].split('+'):
            hostname = exec_host.split('/')[0]
            if hostname not in nodes:
                nodes.append(hostname)
    return nodes
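# A self-contained check of the exec_host parsing logic above, using a
# made-up exec_host value ("host/slot" entries joined with '+'):
sample = "node1/0+node1/1+node2/0"
hostnames = []
for part in sample.split('+'):
    name = part.split('/')[0]
    if name not in hostnames:
        hostnames.append(name)
assert hostnames == ['node1', 'node2']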
def _get_current_notes(self, nodes):
    '''Retrieve the current note of each of the given nodes.'''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:_get_current_notes input:%s' % str(nodes), file=sys.stderr)

    pbsq = PBSQuery.PBSQuery()
    rdict = dict()

    ## We are only interested in the note
    for node, properties in pbsq.getnodes(['note']).items():
        if node in nodes and 'note' in properties:
            rdict[node] = properties['note']
    return rdict
import sys
import time

import PBSQuery


def main(job_id):
    '''Monitor a job_id and run job_cleanup when the elapsed_time is
    within exit_time seconds of the wall_time.'''
    p = PBSQuery.PBSQuery()
    job = p.getjob(job_id)
    if not job:
        raise Exception('invalid job id %s.' % job_id)
    wall_time = hhmmss_to_seconds(job[job_id]['Resource_List.walltime'])
    try:
        sleep_time = wall_time - options.exit_time
    except NameError:
        # Fall back to the module-level default in case watchdog is
        # being used as a module.
        sleep_time = wall_time - exit_time
    print('Watchdog sleeping for %i' % sleep_time)
    time.sleep(sleep_time)
    job_cleanup()
    sys.exit()
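# hhmmss_to_seconds() is not shown in this snippet; a minimal sketch of what
# it presumably does, assuming PBS walltime strings of the form "HH:MM:SS":
def hhmmss_to_seconds(hhmmss):
    """Convert an 'HH:MM:SS' walltime string to an integer number of seconds."""
    hours, minutes, seconds = (int(part) for part in hhmmss.split(':'))
    return hours * 3600 + minutes * 60 + seconds

assert hhmmss_to_seconds('01:30:00') == 5400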
def print_list(self, args, options):
    '''Collect all nodes whose state is down, offline or unknown.'''
    p = PBSQuery.PBSQuery()

    if pbs.version_info >= (4, 0, 0):
        if self.obj_sara_nodes.verbose:
            print('Enabling new_data_structure for PBSQuery')
        p.new_data_structure()

    try:
        nodes = p.getnodes(['state', 'note'])
    except PBSQuery.PBSError as detail:
        print('PBSQuery error: %s' % detail)
        sys.exit(1)
def __init__(self, pbs_server=None):
    self.pbsq = None
    retry = 0
    cached_exception = None

    # Retry with a quadratic backoff (1s, 4s, 9s, ...) before giving up
    # on the PBS server.
    while not self.pbsq and retry < _MAX_RETRY:
        try:
            self.pbsq = PBSQuery.PBSQuery(server=pbs_server)
        except PBSQuery.PBSError as e:
            cached_exception = e
            retry += 1
            time.sleep(retry ** 2)

    if not self.pbsq:
        if cached_exception:
            raise civet_exceptions.CivetException(str(cached_exception))
        else:
            raise civet_exceptions.CivetException(
                "Unable to instantiate PBSQuery instance, unknown error.")

    self.pbs_server = pbs_server
def status(self):
    """Return a tuple of the number of pipeline jobs running and queued.

    Inputs:
        None

    Outputs:
        running: The number of pipeline jobs currently marked as
            running by the queue manager.
        queued: The number of pipeline jobs currently marked as
            queued by the queue manager.
    """
    numrunning = 0
    numqueued = 0
    batch = PBSQuery.PBSQuery().getjobs()
    for j in batch.keys():
        if batch[j]['Job_Name'][0].startswith(self.job_basename):
            if 'R' in batch[j]['job_state']:
                numrunning += 1
            elif 'Q' in batch[j]['job_state']:
                numqueued += 1
    return (numrunning, numqueued)
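# Hypothetical usage sketch (the 'pipeline' instance is assumed, not from
# the original source):
#
#     running, queued = pipeline.status()
#     print("%d jobs running, %d jobs queued" % (running, queued))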
def delete(self, queue_id):
    """Remove the job identified by 'queue_id' from the queue.

    Input:
        queue_id: Unique identifier for a job.

    Output:
        None

    *** NOTE: A pipeline_utils.PipelineError is raised if the job
        removal fails.
    """
    cmd = "qsig -s SIGINT %s" % queue_id
    pipe = subprocess.Popen(cmd, shell=True)

    # Wait a few seconds and see if the job is still being tracked by
    # the queue manager, or if it is marked as exiting.
    time.sleep(5)
    batch = PBSQuery.PBSQuery().getjobs()
    if (queue_id in batch) and ('E' not in batch[queue_id]['job_state']):
        errormsg = "The job (%s) is still in the queue " % queue_id
        errormsg += "and is not marked as exiting (status = 'E')!\n"
        raise pipeline_utils.PipelineError(errormsg)
def _get_submit_node(self):
    """Return the name of the node to submit to.

    Inputs:
        None

    Output:
        node: The name of the node that the next job should be
            submitted to.
    """
    batch = PBSQuery.PBSQuery()
    nodes = batch.getnodes_with_property(self.property)
    max_cpus_free = -1
    node = None
    for n in nodes.keys():
        num_jobs = len(nodes[n].setdefault('jobs', []))
        if (nodes[n]['state'] != ['free']) or \
                (num_jobs >= self.max_jobs_per_node):
            continue
        # Prefer the free node with the most unused CPUs.
        cpus_free = int(nodes[n]['np'][0]) - num_jobs
        if cpus_free > max_cpus_free:
            node = n
            max_cpus_free = cpus_free
    return node
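# A self-contained check of the same "most CPUs free wins" selection rule,
# run on made-up data shaped like PBSQuery's getnodes() output:
fake_nodes = {
    'node01': {'state': ['free'], 'np': ['8'], 'jobs': ['0/1.srv', '1/2.srv', '2/3.srv']},
    'node02': {'state': ['free'], 'np': ['12'], 'jobs': ['0/4.srv', '1/5.srv', '2/6.srv', '3/7.srv']},
    'node03': {'state': ['offline'], 'np': ['8'], 'jobs': []},
}
best = max((n for n, d in fake_nodes.items() if d['state'] == ['free']),
           key=lambda n: int(fake_nodes[n]['np'][0]) - len(fake_nodes[n]['jobs']))
assert best == 'node02'  # 12 - 4 = 8 CPUs free beats 8 - 3 = 5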
def note(self, node, note_attr):
    '''Combine all note methods and return the new note.'''
    p = PBSQuery.PBSQuery()
    p.new_data_structure()
    pbs_info = p.getnode(node)
    pre_parts = list()
    old_note = None
    new_note = None

    if 'note' in pbs_info:
        pbs_note = pbs_info['note']
        if len(pbs_note) > 4:
            # The first four fields are metadata; the rest is the note text.
            pre_parts = pbs_note[:4]
            old_note = ', '.join(pbs_note[4:])
            pre_parts[1] = self.create_date()
            pre_parts[2] = self.note_return_username(pre_parts[2])
    else:
        pre_parts = self.note_init()

    if 'ticket' in note_attr:
        pre_parts[3] = self.note_check_ticket(note_attr['ticket'], pre_parts[3])

    if 'note' in note_attr and 'mode' in note_attr:
        if note_attr['note'] and note_attr['mode'] in ['a', 'w']:
            if old_note:
                new_note = self.note_create(note_attr['note'], note_attr['mode'], old_note)
            else:
                new_note = self.note_create(note_attr['note'], note_attr['mode'])
        else:
            new_note = old_note

    return '%s,%s' % (','.join(pre_parts), new_note)
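# The note layout assumed above, illustrated with a hypothetical value: the
# first four fields are metadata and everything after them is free-form text,
# e.g. ['offline', '2023-01-01', 'alice', '#1234', 'mem dimm broken'].
# pre_parts[1] holds the date, pre_parts[2] the username and pre_parts[3] the
# ticket reference; only the free-form tail is rewritten via note_create().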
import re
import sys

import PBSQuery


def print_get_nodes(hosts=None):
    '''Retrieve the node information from your batch environment.'''
    if ARGS_VERBOSE:
        _print('func:print_get_nodes input:%s' % str(hosts), file=sys.stderr)

    ## there are 2 possible filters: by hostname, or by state
    pbsq = PBSQuery.PBSQuery()
    split_1 = dict()
    split_2 = dict()

    if ARGS_VERBOSE:
        _print('func:print_get_nodes fetching node information', file=sys.stderr)

    ## Ask the batch system for all nodes, with the properties state and note
    for host, properties in pbsq.getnodes(['state', 'note']).items():
        do_host = None

        ## Check whether the current host matches our criterion (given with
        ## the arguments) or has an allowed state
        if hosts and host in hosts:
            do_host = host
        elif not hosts:
            ## Intersect both sets; if there is a match, the host is allowed
            if bool(ALLOWED_STATES.intersection(set(properties.state))):
                do_host = host

        ## When we have a do_host (it matches our criterion), sort the
        ## hosts by basename
        if do_host:
            if SPLIT_SORT and re.findall(SPLIT_SORT, do_host):
                split_1[host] = properties
            else:
                split_2[host] = properties

    if ARGS_VERBOSE:
        _print('func:print_get_nodes returning values', file=sys.stderr)

    return split_1, split_2
#!/usr/bin/env python
import os, subprocess, shutil, glob, time, datetime, pytz
import config, utils, database, PBSQuery

prestodir = os.environ["PRESTO"]
#checkpoints = glob.glob(os.path.join(config.jobsdir, "*.checkpoint"))
checkpoints = []
queue = PBSQuery.PBSQuery()
print("Starting GBNCC job submitter...")
while True:
    print("Connecting to database")
    db = database.Database("observations")
    query = "SELECT ProcessingID,FileName FROM GBNCC WHERE "\
            "ProcessingStatus='i'"
    print("Updating job states")
    db.execute(query)
    ret = db.fetchall()
    if len(ret) != 0:
        print("Getting all currently running jobs")
        alljobs = queue.getjobs()
        if alljobs is not None:
            for jobid, filenm in ret:
                if str(jobid) in alljobs:
                    if alljobs[str(jobid)]["job_state"][0] == "R":
                        nodenm = alljobs[str(jobid)]["exec_host"][0]
                        jobnm = alljobs[str(jobid)]["Job_Name"][0]
                        #checkpoint = os.path.join(config.jobsdir, jobnm+".checkpoint")
                        #with open(checkpoint, "w") as f:
                        #    f.write(nodenm+"\n")
def pbs_batch(self, nodes, attrs=None, note_attributes=None):
    nodeserror = list()
    if not attrs and not note_attributes:
        raise sara_nodesException('attrs and note_attributes can not be empty together!')

    if not self.dryrun:
        if note_attributes and len(note_attributes) == 3:
            if attrs:
                attributes = attrs + pbs.new_attropl(1)
                attributes[1].name = pbs.ATTR_NODE_note
                attributes[1].op = pbs.SET
            else:
                attributes = pbs.new_attropl(1)
                attributes[0].name = pbs.ATTR_NODE_note
                attributes[0].op = pbs.SET
        else:
            attributes = attrs

        # Some hacking here because of a limitation in the Torque 2.4
        # version: fetch the note data first for all nodes!
        tmp_node_note = dict()
        for node in nodes:
            if note_attributes and len(note_attributes) == 3:
                tmp_node_note[node] = self.note(node, note_attributes)

        pbs_server = pbs.pbs_default()
        if not pbs_server:
            raise sara_nodesException('Default pbs server not found!')
        pbs_connection = pbs.pbs_connect(pbs_server)

        for node in nodes:
            if note_attributes and len(note_attributes) == 3:
                try:
                    if attrs:
                        attributes[1].value = tmp_node_note[node]
                    else:
                        attributes[0].value = tmp_node_note[node]
                except KeyError:
                    pass
            rcode = pbs.pbs_manager(pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE, node, attributes, 'NULL')
            if rcode > 0:
                errno, text = pbs.error()
                nodeserror.append('%s: %s (%s)' % (node, text, errno))
    else:
        p = PBSQuery.PBSQuery()
        pbsnodes = p.getnodes().keys()
        print('%*s:' % (7, 'Nodes'), end=' ')
        firstitem = True
        for node in nodes:
            if node in pbsnodes:
                if firstitem:
                    print('%s' % node)
                    firstitem = False
                else:
                    print('%*s' % (17, node))
            else:
                nodeserror.append('%s: does not exist' % node)

    if len(nodeserror) > 0:
        raise sara_nodesException(nodeserror)
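# Hypothetical invocation sketch (object and values assumed, not from the
# original source). note_attributes must hold exactly three items; a dict
# with the keys consumed by self.note() above fits that requirement:
#
#     obj.pbs_batch(['node01', 'node02'],
#                   note_attributes={'mode': 'w',
#                                    'note': 'mem dimm broken',
#                                    'ticket': '1234'})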
def __init__(self, pbs_server=None):
    self.pbsq = PBSQuery.PBSQuery(server=pbs_server)
    self.pbs_server = pbs_server