def cobalt_query(state):
    cqm = ComponentProxy('queue-manager', defer=True)
    scheduler = ComponentProxy('scheduler', defer=True)
    if state not in ('running', 'queued', 'reservation'):
        return None
    # Templates for queries to cobalt
    query_job = dict.fromkeys(job_query_fields, '*')
    query_res = dict.fromkeys(reservation_query_fields, '*')
    if state == 'reservation':
        return scheduler.get_reservations([query_res])
    if state in ('running', 'starting'):
        query_job['state'] = 'running'
        query_job['location'] = '*'
    if state == 'queued':
        query_job['state'] = 'queued'
        query_job['score'] = '*'
    return cqm.get_jobs([query_job])
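# A minimal usage sketch for cobalt_query (an illustration, not part of the
# original source): it assumes job_query_fields includes 'jobid', and that
# get_jobs returns a list of dicts keyed by the requested fields.
def print_running_jobs():
    jobs = cobalt_query('running')
    if not jobs:
        print "no running jobs matched"
        return
    for job in jobs:
        print "job %s is running on %s" % (job.get('jobid'), job.get('location'))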
try:
    cqm = ComponentProxy("queue-manager", defer=False)
except ComponentLookupError:
    print >> sys.stderr, "Failed to connect to queue manager"
    sys.exit(1)
for i in range(len(args)):
    if args[i] == '*':
        continue
    try:
        args[i] = int(args[i])
    except ValueError:
        logger.error("jobid must be an integer")
        raise SystemExit, 1
if opts['start']:
    # waiting for the jobs to start: match only jobs that are neither
    # active nor completed yet
    query = [{'tag': 'job', 'jobid': jid, 'is_active': False,
              'has_completed': False} for jid in args]
else:
    query = [{'tag': 'job', 'jobid': jid} for jid in args]
# poll until no job matches the query any longer
while True:
    response = cqm.get_jobs(query)
    if len(response) == 0:
        raise SystemExit, 0
    else:
        time.sleep(2)
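# The specs above use Cobalt's matching convention: a literal value must match
# exactly, while '*' matches anything.  A variant of the same wait loop with a
# deadline, sketched here as an illustration (the 300-second timeout is an
# assumed value, not from the original):
deadline = time.time() + 300
while time.time() < deadline:
    if not cqm.get_jobs(query):
        raise SystemExit, 0
    time.sleep(2)
print >> sys.stderr, "timed out waiting for jobs"
raise SystemExit, 1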
try:
    query.append({'tag': 'job', 'jobid': int(n), 'queue': '*'})
except ValueError:
    print "%s is not a valid jobid or queue name" % n
    sys.exit(2)
for q in query:
    for h in long_header:
        if h == 'JobName':
            q.update({'outputpath': '*'})
        elif h not in ['JobID', 'Queue']:
            q.update({h.lower(): '*'})
        if h in query_dependencies.keys():
            for x in query_dependencies[h]:
                if x not in header:
                    q.update({x.lower(): '*'})
    q["user"] = user_name
response = cqm.get_jobs(query)
if len(args) and not response:
    sys.exit(1)
if opts['Q']:
    for q in response:
        if q['maxtime'] is not None:
            q['maxtime'] = "%02d:%02d:00" % divmod(int(q['maxtime']), 60)
        if q['mintime'] is not None:
            q['mintime'] = "%02d:%02d:00" % divmod(int(q['mintime']), 60)
    output = [[q[x] for x in [y.lower() for y in header]] for q in response]
else:
    if response:
        maxjoblen = max([len(str(item.get('jobid'))) for item in response])
        jobidfmt = "%%%ss" % maxjoblen
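# Worked example of the minutes -> "HH:MM:00" conversion used above:
# divmod(minutes, 60) yields an (hours, minutes) tuple that feeds both
# fields of the format string.
assert divmod(125, 60) == (2, 5)
assert "%02d:%02d:00" % divmod(125, 60) == "02:05:00"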
try:
    pm = ComponentProxy("process-manager", defer=False)
except ComponentLookupError:
    print >> sys.stderr, "Failed to connect to process manager"
    sys.exit(1)
r = pm.add_jobs([{
    'tag': 'process-group',
    'user': user,
    'args': [],
    'env': {},
    'executable': '/tmp/testscript',
    'size': 700,
    'cwd': '/tmp',
    'location': ['ANLR00'],
    'outputfile': '/tmp/test1-output',
    'errorfile': '/tmp/test1-error',
    'id': '*'
}])
print "jobs : " + repr(len(r))
pgid = r[0]['id']
while True:
    r = pm.get_jobs([{'tag': 'process-group', 'id': pgid, 'state': '*'}])
    state = r[0]['state']
    if state == 'running':
        time.sleep(5)
        continue
    else:
        break
print "process group %s has completed" % (pgid)
try:
    args[i] = int(args[i])
    all_jobs.add(args[i])
except ValueError:
    logger.error("jobid must be an integer")
    raise SystemExit, 1
check_specs = [{'tag': 'job', 'user': user, 'jobid': jobid,
                'user_hold': '*'} for jobid in args]
try:
    check_response = cqm.get_jobs(check_specs)
except xmlrpclib.Fault, flt:
    print flt.faultString
    raise SystemExit, 1
jobs_existed = [j.get('jobid') for j in check_response]
all_jobs = all_jobs.union(set(jobs_existed))
update_specs = [{'tag': 'job', 'user': user, 'jobid': jobid,
                 'user_hold': "*", 'is_active': "*"} for jobid in jobs_existed]
updates = {'user_hold': True}
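# Sketch of applying the hold collected above; set_jobs(specs, updates, user)
# matches the call used elsewhere in this code, but this surrounding error
# handling is an assumed pattern rather than the original fragment's.
try:
    update_response = cqm.set_jobs(update_specs, updates, user)
except xmlrpclib.Fault, flt:
    print flt.faultString
    raise SystemExit, 1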
try:
    args[i] = int(args[i])
except ValueError:
    logger.error("jobid must be an integer")
    raise SystemExit, 1
spec = [{'tag': 'job', 'user': user, 'jobid': jobid, 'project': '*',
         'notify': '*', 'walltime': '*', 'queue': '*', 'procs': '*',
         'nodes': '*'} for jobid in args]
try:
    filters = CP.get('cqm', 'filters').split(':')
except ConfigParser.NoOptionError:
    filters = []
try:
    jobdata = cqm.get_jobs(spec)
except xmlrpclib.Fault, flt:
    print flt.faultString
    raise SystemExit, 1
if not jobdata:
    print "Failed to match any jobs"
    sys.exit(1)
response = []
for jobinfo in jobdata:
    original_spec = jobinfo.copy()
    jobinfo.update({'queue': queue})
    for filt in filters:
        Cobalt.Util.processfilter(filt, jobinfo)
    try:
import os
import pwd
import sys

import Cobalt.Logging
import Cobalt.Util
from Cobalt.Proxy import ComponentProxy
from Cobalt.Exceptions import ComponentLookupError

if __name__ == '__main__':
    level = 20
    if '-d' in sys.argv:
        level = 10
    Cobalt.Logging.setup_logging('cmd', to_syslog=False, level=level)
    user = pwd.getpwuid(os.getuid())[0]
    try:
        pm = ComponentProxy("process-manager", defer=False)
    except ComponentLookupError:
        print >> sys.stderr, "Failed to connect to process manager"
        sys.exit(1)
    r = pm.add_jobs([{'tag': 'process-group', 'user': user, 'args': [],
                      'env': {}, 'executable': '/tmp/testscript', 'size': 700,
                      'cwd': '/tmp', 'location': ['ANLR00'],
                      'outputfile': '/tmp/test1-output',
                      'errorfile': '/tmp/test1-error', 'id': '*'}])
    print "jobs : " + repr(len(r))
    pgid = r[0]['id']
    while True:
        r = pm.get_jobs([{'tag': 'process-group', 'id': pgid, 'state': '*'}])
        state = r[0]['state']
        if state == 'running':
            Cobalt.Util.sleep(5)
            continue
        else:
            break
    print "process group %s has completed" % (pgid)
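# The loop above polls until the process group stops running; a hedged helper
# with a deadline, for callers that cannot block indefinitely (the helper name
# and timeout are illustrative, not part of the original script):
import time

def wait_for_process_group(pm, pgid, timeout=600, interval=5):
    """Return True once the group leaves 'running', False on timeout."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        r = pm.get_jobs([{'tag': 'process-group', 'id': pgid, 'state': '*'}])
        if r[0]['state'] != 'running':
            return True
        time.sleep(interval)
    return False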
if opts['held']:
    query['state'] = opts['held']
else:
    query['state'] = '*'
if opts['queue']:
    query['queue'] = opts['queue']
else:
    query['queue'] = '*'
try:
    cqm = ComponentProxy("queue-manager", defer=False)
    query['tag'] = 'job'
    query['jobid'] = '*'
    response = cqm.get_jobs([query])
except ComponentLookupError:
    logger.error("Can't connect to the queue manager")
    sys.exit(1)
# log jobid to stdout
if not response:
    Cobalt.Logging.logging.error("Failed to match any jobs")
else:
    Cobalt.Logging.logging.debug(response)
    print " The following jobs matched your query:"
    for job in response:
logger.error("node count out of realistic range") sys.exit(1) updates['nodes'] = opts['nodecount'] # ensure time is actually in minutes if opts['time']: if opts['time'][0] in ['+', '-']: try: minutes = Cobalt.Util.get_time(opts['time'][1:]) except Cobalt.Exceptions.TimeFormatError, e: print "invalid time specification: %s" % e.args[0] sys.exit(1) jobdata = None try: cqm = ComponentProxy("queue-manager", defer=False) jobdata = cqm.get_jobs(spec) except ComponentLookupError: print >> sys.stderr, "Failed to connect to queue manager" sys.exit(1) if not jobdata: print "Failed to match any jobs" sys.exit(1) if opts['time'][0] == '-': new_time = float(jobdata[0]['walltime']) - minutes if new_time <= 0: print >> sys.stderr, "invalid wall time: ", new_time else: updates['walltime'] = str( float(jobdata[0]['walltime']) - minutes) elif opts['time'][0] == '+':
if opts["held"]: query["state"] = opts["held"] else: query["state"] = "*" if opts["queue"]: query["queue"] = opts["queue"] else: query["queue"] = "*" try: cqm = ComponentProxy("queue-manager", defer=False) query["tag"] = "job" query["jobid"] = "*" response = cqm.get_jobs([query]) except ComponentLookupError: logger.error("Can't connect to the queue manager") sys.exit(1) # except: # $logger.error("Error querying jobs") # sys.exit(1) # log jobid to stdout if not response: Cobalt.Logging.logging.error("Failed to match any jobs") else: Cobalt.Logging.logging.debug(response) print " The following jobs matched your query:" for job in response:
def test_something(self):
    logging.basicConfig()

    try:
        cqm = ComponentProxy("queue-manager")
    except ComponentLookupError:
        assert not "failed to connect to queue manager"

    # add a queue
    queues = cqm.add_queues([{"tag": "queue", "name": "default"}])
    assert len(queues) == 1

    # try adding a job to a queue that doesn't exist
    try:
        jobs = cqm.add_jobs([{"tag": "job", "queue": "jonx"}])
    except xmlrpclib.Fault:
        # trying to add a job to a queue that doesn't exist results in an xmlrpc Fault
        pass
    else:
        assert not "Adding job to non-existent queue should raise xmlrpclib.Fault"

    # get the list of available partitions and add them to the pool of managed partitions
    try:
        simulator = ComponentProxy("system")
    except ComponentLookupError:
        assert not "failed to connect to simulator"
    for part_name in self.system._partitions:
        partitions = simulator.add_partitions([{"tag": "partition", "name": part_name, "queue": "default"}])
        assert len(partitions) == 1
        partitions = simulator.set_partitions(
            [{"tag": "partition", "name": part_name}],
            {"functional": True, "scheduled": True}
        )
        assert len(partitions) == 1
    partitions = simulator.get_partitions([{"name": "*", "size": "*", "queue": "*"}])
    assert len(partitions) > 0

    # now run a real job
    #
    # 1. add the job to the default queue
    # 2. obtain a partition for it to run on
    # 3. start running it on that partition
    # 4. check that it started running
    # 5. sleep for a bit, and then check that it's still running
    # 6. sleep some more and then check to see if it actually finished running
    nodes = partitions[0]["size"]
    jobs = cqm.add_jobs([{
        "queue": "default",
        "mode": "co",
        "command": "/bin/ls",
        "outputdir": os.getcwd(),
        "walltime": 4,
        "nodes": nodes,
        "procs": nodes,
        "args": [],
        "user": "******",
        "jobid": "*",
    }])
    assert len(jobs) == 1
    job = jobs[0]
    jobid = job["jobid"]
    job_location_args = [{
        "jobid": jobid,
        "nodes": job["nodes"],
        "queue": job["queue"],
        "utility_score": 1,
        "threshold": 1,
        "walltime": job["walltime"],
        "attrs": {},
    }]
    locations = simulator.find_job_location(job_location_args, [])
    assert jobid in locations
    location = locations[jobid]
    cqm.run_jobs([{"jobid": jobid}], location)

    r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
    if not r:
        assert not "the job didn't start"
    time.sleep(20)
    r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
    if len(r) != 1:
        assert not "the job has stopped running prematurely"
    start_time = time.time()
    while True:
        r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
        if r:
            if time.time() - start_time > 240:
                assert not "the job seems to have run overtime"
            else:
                time.sleep(5)
        else:
            break

    # this time, we'll add a job to the queue, start the job, sleep for a bit
    # and then try to kill the job before it has finished
    nodes = partitions[0]["size"]
    jobs = cqm.add_jobs([{
        "queue": "default",
        "mode": "co",
        "command": "/bin/ls",
        "outputdir": os.getcwd(),
        "walltime": 4,
        "nodes": nodes,
        "procs": nodes,
        "args": [],
        "user": "******",
        "jobid": "*",
    }])
    assert len(jobs) == 1
    job = jobs[0]
    jobid = job["jobid"]
    job_location_args = [{
        "jobid": jobid,
        "nodes": job["nodes"],
        "queue": job["queue"],
        "utility_score": 1,
        "threshold": 1,
        "walltime": job["walltime"],
        "attrs": {},
    }]
    locations = simulator.find_job_location(job_location_args, [])
    assert jobid in locations
    location = locations[jobid]
    cqm.run_jobs([{"jobid": jobid}], location)

    r = cqm.get_jobs([{"jobid": jobid, "state": "*", "is_active": True}])
    if not r:
        assert not "the job didn't start"
    time.sleep(20)
    r = cqm.get_jobs([{"jobid": jobid, "is_active": True}])
    if len(r) != 1:
        assert not "the job has stopped running prematurely"
    cqm.del_jobs([{"jobid": jobid}])
    start_time = time.time()
    while True:
        r = cqm.get_jobs([{"jobid": jobid, "is_active": True, "state": "*"}])
        if r:
            if time.time() - start_time > 30:
                assert not "the job didn't die when asked to"
            else:
                time.sleep(1)
        else:
            break
Cobalt.Logging.setup_logging('cobalt-mpirun', to_syslog=False, level=level)
logger = logging.getLogger('cobalt-mpirun')

if "COBALT_JOBID" not in os.environ:
    logger.error("cobalt-mpirun must be invoked by a script submitted to cobalt.")
    raise SystemExit, 1
try:
    cqm = ComponentProxy("queue-manager", defer=False)
except ComponentLookupError:
    print >> sys.stderr, "Failed to connect to queue manager"
    sys.exit(1)
response = cqm.get_jobs([{'tag': 'job', 'jobid': int(os.environ["COBALT_JOBID"]),
                          'state': '*', 'procs': '*', 'location': '*',
                          'walltime': '*', 'outputdir': '*'}])
if len(response) == 0:
    logger.error("Error: cqm did not find a job with id " + os.environ["COBALT_JOBID"])
    raise SystemExit, 1
if len(response) > 1:
    logger.error("Error: cqm did not find a unique job with id " + os.environ["COBALT_JOBID"])
    raise SystemExit, 1
j = response[0]
if j['location'] is None:
    logger.error("Error: cobalt-mpirun's parent is in state '%s' "
                 "and has not specified a partition." % j['state'])
    raise SystemExit, 1
# j['location'] = "ANLR00"
arglist = ['-partition', j['location'][0]] + arglist
all_jobs = set()
for i in range(len(args)):
    if args[i] == '*':
        continue
    try:
        args[i] = int(args[i])
        all_jobs.add(args[i])
    except ValueError:
        logger.error("jobid must be an integer")
        sys.exit(1)
check_specs = [{'tag': 'job', 'user': user, 'jobid': jobid,
                'user_hold': '*'} for jobid in args]
try:
    check_response = cqm.get_jobs(check_specs)
except xmlrpclib.Fault, flt:
    print flt.faultString
    raise SystemExit, 1
jobs_existed = [j.get('jobid') for j in check_response]
all_jobs = all_jobs.union(set(jobs_existed))
update_specs = [{'tag': 'job', 'user': user, 'jobid': jobid,
                 'user_hold': "*", 'is_active': "*"} for jobid in jobs_existed]
if opt.deps:
    updates = {'all_dependencies': []}
else:
    updates = {'user_hold': False}
try:
    update_response = cqm.set_jobs(update_specs, updates, user)
except xmlrpclib.Fault, flt:
    # (the fragment breaks off after the call; this handler mirrors the
    # get_jobs error handling above)
    print flt.faultString
    raise SystemExit, 1
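# all_jobs holds every jobid the user named and jobs_existed only those the
# queue manager matched, so their difference identifies unknown jobs; this
# report is an illustrative follow-up, not part of the original fragment.
for jobid in sorted(all_jobs.difference(jobs_existed)):
    print >> sys.stderr, "job %s not found" % jobid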