def get_jobs_by_node_ids():
    """Return the jobs that have CPUs allocated on any of the POSTed nodes.

    The request body is JSON of the form ``{"nodes": [...]}``.  Returns a
    dict mapping job id -> job description.
    """
    if mocking:
        jobs = mock('jobs.json')
    else:
        jobs = pyslurm.job().get()

    # py3 fix: print() function and dict.items() instead of the py2-only
    # print statement and iteritems()
    print("Post datas : %s" % request.data)
    nodes = json.loads(request.data).get('nodes', [])
    print("Nodelist : %s" % nodes)

    returned_jobs = {}
    # filter jobs by node
    for jobid, job in jobs.items():
        nodes_list = job['cpus_allocated'].keys()
        print("Nodelist for %s : %s" % (jobid, nodes_list))
        for node_id in nodes:
            if node_id in nodes_list:
                returned_jobs[jobid] = job
                print("Node %s added to jobs : %s" % (node_id, returned_jobs))
    if not mocking:
        # user info can only be resolved against real (non-mocked) job data
        for jobid, job in returned_jobs.items():
            fill_job_user(job)
    return returned_jobs
def show_job(job_id):
    """Look up a single job by id and attach user information to it."""
    # pyslurm >= 16.05 expects a string in parameter of job.find_id()
    found = pyslurm.job().find_id(str(job_id))
    fill_job_user(found)
    return found
def test_job_kill():
    """Job: Test job().slurm_kill_job()."""
    matches = pyslurm.job().find(name="name", val="pyslurm_test_job")
    victim_id = matches[0]
    time.sleep(3)
    result = pyslurm.slurm_kill_job(
        victim_id, Signal=9, BatchFlag=pyslurm.KILL_JOB_BATCH)
    assert_equals(result, 0)
def get_jobs():
    """Return all Slurm jobs as a JSON response, with user info filled in."""
    jobs = pyslurm.job().get()
    # add login and username (additionally to UID) for each job
    # py3 fix: dict.iteritems() does not exist in Python 3
    for jobid, job in jobs.items():
        fill_job_user(job)
    return jsonify(jobs)
def show_job(job_id):
    """Return one job by id (mocked answer when mocking is enabled)."""
    if mocking:
        return mock_job(job_id)
    # consistency fix: pyslurm >= 16.05 expects a string job id, so cast
    # explicitly (str() on a str is a no-op, so this is backward compatible)
    job = pyslurm.job().find_id(str(job_id))
    fill_job_user(job)
    return job
def rpc(self):
    """Populate gres, shared and end attributes from live pyslurm data.

    Looks up the job identified by ``self.jobid`` and caches the fields
    this object needs on the instance.
    """
    job_list = pyslurm.job().find_id(str(self.jobid))
    job = job_list[0]
    # idiom fix: dict.get() replaces the "'gres' in job.keys()" test plus
    # the manual None default — identical result, one lookup
    self.gres = job.get('gres')
    self.shared = job['shared'] != '0'
    self.end = datetime.fromtimestamp(job['end_time'], localtz())
def test_job_scontrol():
    """Job: Compare scontrol values to PySlurm values."""
    all_job_ids = pyslurm.job().ids()
    # Make sure job is running first
    test_job = all_job_ids[0]
    test_job_info = pyslurm.job().find_id(test_job)[0]
    assert_equals(test_job, test_job_info["job_id"])
    sctl = subprocess.Popen(
        ["scontrol", "-d", "show", "job", str(test_job)],
        stdout=subprocess.PIPE).communicate()
    sctl_stdout = sctl[0].strip().decode("UTF-8", "replace").split()
    # robustness fix: split on the FIRST '=' only so values that themselves
    # contain '=' are not truncated, and skip tokens with no '=' at all
    sctl_dict = dict(
        token.split("=", 1) for token in sctl_stdout if "=" in token)
    assert_equals(test_job_info["batch_flag"], int(sctl_dict["BatchFlag"]))
    assert_equals(test_job_info["cpus_per_task"], int(sctl_dict["CPUs/Task"]))
    assert_equals(test_job_info["contiguous"], int(sctl_dict["Contiguous"]))
    assert_equals(test_job_info["exit_code"], sctl_dict["ExitCode"])
    assert_equals(test_job_info["job_id"], int(sctl_dict["JobId"]))
    assert_equals(test_job_info["name"], sctl_dict["JobName"])
    assert_equals(test_job_info["job_state"], sctl_dict["JobState"])
    assert_equals(test_job_info["nice"], int(sctl_dict["Nice"]))
    assert_equals(test_job_info["num_cpus"], int(sctl_dict["NumCPUs"]))
    assert_equals(test_job_info["num_nodes"], int(sctl_dict["NumNodes"]))
    assert_equals(test_job_info["partition"], sctl_dict["Partition"])
    assert_equals(test_job_info["priority"], int(sctl_dict["Priority"]))
    assert_equals(test_job_info["state_reason"], sctl_dict["Reason"])
    assert_equals(test_job_info["reboot"], int(sctl_dict["Reboot"]))
    assert_equals(test_job_info["requeue"], int(sctl_dict["Requeue"]))
    assert_equals(test_job_info["restart_cnt"], int(sctl_dict["Restarts"]))
    assert_equals(test_job_info["std_err"], sctl_dict["StdErr"])
    assert_equals(test_job_info["std_in"], sctl_dict["StdIn"])
    assert_equals(test_job_info["std_out"], sctl_dict["StdOut"])
    assert_equals(test_job_info["time_limit_str"], sctl_dict["TimeLimit"])
    assert_equals(test_job_info["work_dir"], sctl_dict["WorkDir"])
def main():
    """
    Query Slurm nodes and jobs and print a usage report (CSV or plain),
    then print credential/core availability counts.
    """
    # Parse command-line arguments
    arguments = parse_arguments()

    # Logging setup
    if arguments.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    node_dict = pyslurm.node().get()
    job_dict = pyslurm.job().get()

    # truthiness replaces the redundant len(...) > 0 comparisons
    if node_dict and job_dict:
        nt = get_nodetag(node_dict, job_dict, arguments)
        pc = get_pending(job_dict)
        if arguments.csv:
            print_csv(arguments.csv_header_suppress, nt, pc)
        else:
            js = get_aggregated_jobs(job_dict, arguments)
            print_usage(js)
    else:
        # py3 fix: print statement -> print() function call
        print("No Nodes and/or no Jobs found !")
        sys.exit()

    node_reservations = get_node_reservations()
    jobs = get_jobs(all_jobs=arguments.all_jobs)
    cred_totals, public_cores, public_nodes, public_nodes_free = get_counts(
        node_reservations, jobs)
    if arguments.free_cores:
        print_free_cores(cred_totals, public_cores)
    elif arguments.csv:
        print_csv(arguments.csv_header_suppress, cred_totals, public_cores,
                  public_nodes, public_nodes_free)
    else:
        print_output(cred_totals, public_cores, public_nodes,
                     public_nodes_free)
def run_slurm_cmd(cmd, name):
    """
    For very simple situations where you just want to run a simple shell
    command via slurm this will do the trick.

    cmd --command that can be run from a shell
    name --job name

    returns --the job ID, which is used as a handle for other functions
    """
    return pyslurm.job().submit_batch_job({"wrap": cmd, "job_name": name})
def get_jobs():
    """Return all jobs, enriched with user info and an expanded node list."""
    jobs = pyslurm.job().get()
    # py3 fix: iteritems() -> items()
    for jobid, job in jobs.items():
        # add login and username (additionally to UID) for each job
        try:
            fill_job_user(job)
        except KeyError:
            # best-effort: jobs whose UID cannot be resolved are kept as-is
            pass
        # convert nodeset in array of nodes
        # NOTE(review): .encode() yields bytes on Python 3 — confirm
        # NodeSet accepts bytes input before dropping the encode
        if job["nodes"] is not None:
            jobs[jobid]["nodeset"] = list(
                NodeSet(job["nodes"].encode('ascii', 'ignore')))
    return jobs
def check_slurm_job(handle):
    """
    Get the state of a job initiated via slurm.

    handle --slurm job id

    returns --one of the slurm defined JOB_STATE_CODES strings plus one
    extra UNKNOWN if the handle doesn't match a known slurm job.
    """
    job_attributes = pyslurm.job().find_id(handle)
    # idiom fix: an empty list is already falsy, so the additional
    # len(...) > 0 check was redundant
    if job_attributes:
        return job_attributes[0]["job_state"]
    return "UNKNOWN"
def get_jobs_by_qos():
    """Group all jobs by the QOS they run under.

    Returns a dict mapping qos id -> {job id -> job}; a QOS with no jobs
    maps to an empty dict.
    """
    jobs = pyslurm.job().get()
    qos = pyslurm.qos().get()
    # py3 fix: iteritems() -> plain iteration (only the qos keys were
    # used); the per-qos filter becomes a dict comprehension
    return {
        qos_id: {
            jobid: job for jobid, job in jobs.items()
            if job['qos'] == qos_id
        }
        for qos_id in qos
    }
def get_jobs_by_nodes():
    """Group all jobs by the nodes their CPUs are allocated on.

    Returns a dict mapping node id -> {job id -> job}; a node with no
    running jobs maps to an empty dict.
    """
    jobs = pyslurm.job().get()
    nodes = pyslurm.node().get()
    returned_nodes = {}
    # py3 fix: iteritems() -> plain iteration (only node ids were used);
    # membership test goes against the dict itself, not .keys()
    for node_id in nodes:
        # filter jobs by node
        returned_nodes[node_id] = {
            jobid: job for jobid, job in jobs.items()
            if node_id in job['cpus_allocated']
        }
    return returned_nodes
def get_jobs_by_node_id(node_id):
    """Return the jobs that have CPUs allocated on the given node,
    with user information filled in."""
    jobs = pyslurm.job().get()
    returned_jobs = {}
    # filter jobs by node
    # py3 fix: print() function and dict.items() instead of the py2-only
    # print statement and iteritems()
    for jobid, job in jobs.items():
        nodes_list = job['cpus_allocated'].keys()
        print("Nodelist for %s : %s" % (node_id, nodes_list))
        if node_id in nodes_list:
            returned_jobs[jobid] = job
            print("Node %s added to jobs : %s" % (node_id, returned_jobs))
    for jobid, job in returned_jobs.items():
        fill_job_user(job)
    return returned_jobs
def show_job(job_id):
    """Return a single job by id, with user information filled in."""
    # consistency fix: pyslurm >= 16.05 expects a string job id, so cast
    # explicitly (str() on a str is a no-op, so this is backward compatible)
    job = pyslurm.job().find_id(str(job_id))
    fill_job_user(job)
    return job
def get_queued_jobs_ids() -> List[int]:
    """Return ids of this user's PENDING or RUNNING jobs, excluding the
    self-management job named in CONFIG."""
    active_states = {'PENDING', 'RUNNING'}
    self_name = CONFIG['self_job']['job_name']
    queued = []
    # TODO add more conditions
    for job_id, info in pyslurm.job().find_user(USERNAME).items():
        if info['job_state'] in active_states and info['name'] != self_name:
            queued.append(job_id)
    return queued
def submit_job(params: Dict[str, str]) -> int:
    """Submit a batch job after a fixed delay and return its job id."""
    # NOTE(review): time.sleep() takes seconds but the constant is named
    # SUBMIT_DELAY_MS — confirm the unit is intentional.
    time.sleep(SUBMIT_DELAY_MS)
    new_job_id = pyslurm.job().submit_batch_job(params)
    return new_job_id
part] += node_data['real_memory'] * 1048576 metrics['partition']['mem_usage'][ part] += node_data['alloc_mem'] * 1048576 metrics['partition']['mem_usage_pc'][part] = 100 * ( float(metrics['partition']['mem_usage'][part]) / metrics['partition']['mem_total'][part]) metrics['partition']['gpu_total'][part] += gpu_total metrics['partition']['gpu_usage'][part] += gpu_usage if metrics['partition']['gpu_total'][part] > 0: metrics['partition']['gpu_usage_pc'][part] = 100 * ( float(metrics['partition']['gpu_usage'][part]) / metrics['partition']['gpu_total'][part]) # Now go through the jobs list to see user-specific stuff jobs = pyslurm.job().get() for job in jobs: job = jobs.get(job) if job['user_id'] not in user_ids: user = pwd.getpwuid(job['user_id'])[0] user_ids[job['user_id']] = user metrics['user']['cpu_usage'][user] = 0 metrics['user']['gpu_usage'][user] = 0 metrics['user']['mem_usage'][user] = 0 metrics['user']['jobs_running'][user] = 0 metrics['user']['jobs_pending'][user] = 0 metrics['user']['queue_time'][user] = 0 metrics['user']['queue_jobs'][user] = 0 if config['user_lookup']:
def test_job_get():
    """Job: Test job().get() return type."""
    jobs = pyslurm.job().get()
    assert isinstance(jobs, dict)
def SubmitJob(bashpath, script, filename, user):
    '''
    Submit a job on behalf of *user*.

    bashpath -- path of the bash file submitted with 'sbatch Bash.sh'
    script   -- path of the script the bash file runs
    filename -- name the script must have (as referenced in the bash file)
    user     -- user the job is submitted as

    Copies both files into /home/<user>, submits the batch script through
    the command line as that user, then creates a world-writable
    '<jobname>-<jobid>' directory and moves the slurm output files there.
    Returns True on success, False when the submission itself fails.
    '''
    currDir = os.getcwd()
    try:
        # go into the user's home to execute the batch script there
        os.chdir("/home/" + user)
        # copy bashfile and script into the submission directory
        shutil.copy(script, os.getcwd())
        shutil.copy(bashpath, os.getcwd())
        # grab the bash script name and script name from the full paths
        BashScriptName = ntpath.basename(bashpath)
        ScriptName = ntpath.basename(script)
        # rename the script to what is listed in the bash file
        os.rename(ScriptName, filename)
        try:
            # submit as the target user; the job id is the last token of
            # sbatch's "Submitted batch job <id>" output
            command = "sbatch " + BashScriptName
            result = subprocess.check_output(
                ["runuser", "-l", user, "-c", command])
            jobid = int(result.split()[-1])
            time.sleep(0.3)
        except Exception:
            # submission failed; caller only needs a boolean
            return False
        # make a fully-permissioned directory named after the job
        job_name = pyslurm.job().get()[jobid]['name']
        newDir = str(job_name) + "-" + str(jobid)
        os.mkdir(str(newDir), mode=0o777)
        # move the bash script and slurm output file into the new folder
        slurmname = "slurm-" + str(jobid) + ".out"
        if os.path.isfile(slurmname):
            shutil.move(slurmname, str(newDir))
        shutil.move(BashScriptName, str(newDir))
        time.sleep(0.3)
        shutil.copy(filename, str(newDir))
        return True
    finally:
        # leak fix: always restore the caller's working directory, even
        # when one of the later filesystem operations raises
        os.chdir(currDir)
def test_job_ids():
    """Job: Test job().ids() return type."""
    id_list = pyslurm.job().ids()
    assert isinstance(id_list, list)
def test_job_count():
    """Job: Test job count."""
    job_map = pyslurm.job().get()
    job_ids = pyslurm.job().ids()
    assert_equals(len(job_map), len(job_ids))
def test_job_count():
    """Job: Test job count."""
    job_map = pyslurm.job().get()
    id_list = pyslurm.job().ids()
    counts_match = len(job_map) == len(id_list)
    assert counts_match
def test_job_count():
    """get() and ids() must report the same number of jobs."""
    assert len(pyslurm.job().get()) == len(pyslurm.job().ids())
def test_job_get():
    """job().get() returns a dictionary of jobs."""
    all_jobs = pyslurm.job().get()
    # idiom fix: isinstance() instead of comparing type() against the
    # deprecated types.DictType alias (removed in Python 3)
    assert isinstance(all_jobs, dict)
users = [] if job_dict: for jobid, value in sorted(job_dict.iteritems()): if value["account"] not in users: users.append(value["account"]) return users if __name__ == "__main__": try: pyslurmjob = pyslurm.job() jobs = pyslurmjob.get() except ValueError as e: print 'Job query failed - %s' % (e) sys.exit(1) users = list_users(jobs) delim = "+-------------------------------------------+-----------+------------+---------------+-----------------+--------------+--------------+" print delim print "| USER (NAME) | CPUS USED | NODES USED | CPU REQUESTED | NODES REQUESTED | JOBS RUNNING | JOBS PENDING |" print delim total_procs_request = 0 total_nodes_request = 0 total_procs_used = 0
def test_job_submit():
    """Job: Test job().submit_batch_job()."""
    job_desc = {"wrap": "sleep 3600", "job_name": "pyslurm_test_job"}
    submitted_id = pyslurm.job().submit_batch_job(job_desc)
    found = pyslurm.job().find(name="name", val="pyslurm_test_job")
    assert_true(submitted_id in found)
xml_file = open(slurm_file, 'w') # # Get the controllers # primary, backup = pyslurm.get_controllers() xml_file.write('<?xml version="1.0" encoding="iso-8859-1" ?>\n') xml_file.write("<slurm>\n") xml_file.write("\t<lastUpdate>{0}</lastUpdate>\n".format(now)) # # XML output of Jobs # a = pyslurm.job() jobs = a.get() xml_file.write("\t<jobs>\n") for key, value in jobs.items(): xml_file.write('\t\t<job>\n') xml_file.write("\t\t\t<id>{0}</id>\n".format(key)) for job_key in sorted(value.items()): xml_file.write("\t\t\t<{0}>{1}</{2}>\n".format(job_key[0], job_key[1], job_key[0])) b = pyslurm.jobstep(key, 0, 0) steps = b.get() for job, job_step in sorted(steps.items()): xml_file.write('\t\t\t<jobstep>\n')
def test_job_find_user_int():
    """Job: Test job().find_user() (Integer)."""
    root_uid = 0
    jobs_for_user = pyslurm.job().find_user(root_uid)
    assert isinstance(jobs_for_user, dict)
Memory = meminfo(my_host, rrd) sys.stdout.write("\t<memTotal>{0}</memTotal>\n".format(Memory['MemTotal'])) sys.stdout.write("\t<memFree>{0}</memFree>\n".format(Memory['MemFree'])) sys.stdout.write("\t<cached>{0}</cached>\n".format(Memory['Cached'])) sys.stdout.write("\t<buffers>{0}</buffers>\n".format(Memory['Buffers'])) a = pyslurm.slurm_load_slurmd_status() if a: for host, data in a.iteritems(): sys.stdout.write("\t<slurmd>\n") for key, value in data.iteritems(): sys.stdout.write("\t\t<{0}>{1}</{0}>\n".format(key,value,key)) sys.stdout.write("\t</slurmd>\n") a = pyslurm.job() jobs = a.get() now = int(time.time()) PiDs = {} for key, value in jobs.iteritems(): jobid = key if value['job_state'] == "RUNNING": userid = pwd.getpwuid(value[4])[0] nodes = value['alloc_node'].split(',') if my_host in nodes: PiDs[jobid] = [] a = os.popen('/bin/ps --noheaders -u {0} -o pid,ppid,size,rss,vsize,pcpu,args'.format(userid), 'r')
SUSPENDED = SUSPENDED + (1 if key == None else job_dict[jobid][key]) if job_dict[jobid]["job_state"] == "TIMEOUT": TIMEOUT = TIMEOUT + (1 if key == None else job_dict[jobid][key]) return "boot_fail={0},cancelled={1},completed={2},configuring={3},completing={4},deadline={5},failed={6},node_fail={7},oom={8},pending={9},preempted={10},running={11},resv_del_hold={12},requeue_fed={13},requeue_hold={14},requeued={15},resizing={16},revoked={17},signaling={18},special_exit={19},stage_out={20},stopped={21},suspended={22},timeout={23}".format( BOOT_FAIL, CANCELLED, COMPLETED, CONFIGURING, COMPLETING, DEADLINE, FAILED, NODE_FAIL, OUT_OF_MEMORY, PENDING, PREEMPTED, RUNNING, RESV_DEL_HOLD, REQUEUE_FED, REQUEUE_HOLD, REQUEUED, RESIZING, REVOKED, SIGNALING, SPECIAL_EXIT, STAGE_OUT, STOPPED, SUSPENDED, TIMEOUT) if __name__ == "__main__": try: rjob = pyslurm.job() jobs = rjob.get() if len(jobs) > 0: num_jobs = get_squeue(jobs) num_cpus = get_squeue(jobs, 'num_cpus') print("SQueue,metric=num_jobs,hostname={0} {1}".format( socket.gethostname(), num_jobs)) print("SQueue,metric=num_cpus,hostname={0} {1}".format( socket.gethostname(), num_cpus)) else: print("No jobs found !") except ValueError as e: print("Job query failed - {0}".format(e.args[0]))
def test_job_ids():
    """job().ids() returns a list of job ids."""
    all_job_ids = pyslurm.job().ids()
    # idiom fix: isinstance() instead of comparing type() against the
    # deprecated types.ListType alias (removed in Python 3)
    assert isinstance(all_job_ids, list)
def test_job_get():
    """Job: Test job().get() return type."""
    job_map = pyslurm.job().get()
    assert_true(isinstance(job_map, dict))
def test_job_scontrol():
    """Compare scontrol's view of a running job with job().find_id()."""
    all_job_ids = pyslurm.job().ids()
    # Make sure job is running first
    test_job = all_job_ids[0]
    #assert type(test_job) is IntType
    test_job_info = pyslurm.job().find_id(str(test_job))[0]
    assert test_job == test_job_info["job_id"]
    scontrol = subprocess.Popen(
        ["scontrol", "-d", "show", "job", str(test_job)],
        stdout=subprocess.PIPE).communicate()
    # py3 fix: Popen delivers bytes; decode before string processing
    scontrol_stdout = scontrol[0].strip().decode("UTF-8", "replace").split()
    # robustness fix: split on the FIRST '=' only so values that themselves
    # contain '=' are not truncated; skip tokens with no '=' at all
    scontrol_dict = dict(
        token.split("=", 1) for token in scontrol_stdout if "=" in token)
    #'Account': '(null)',
    #'AllocNode:Sid': 'sms:32207',
    assert test_job_info["batch_flag"] == int(scontrol_dict["BatchFlag"])
    assert test_job_info["batch_host"] == scontrol_dict["BatchHost"]
    assert test_job_info["cpus_per_task"] == int(scontrol_dict["CPUs/Task"])
    assert test_job_info["command"] == scontrol_dict["Command"]
    # This is a bool.
    assert test_job_info["contiguous"] == int(scontrol_dict["Contiguous"])
    #'CoreSpec': '*',
    # 'Dependency': '(null)',
    # 'EligibleTime': '2016-03-31T00:25:32',
    # 'EndTime': '2016-04-01T00:25:33',
    # 'ExcNodeList': '(null)',
    assert test_job_info["exit_code"] == scontrol_dict["ExitCode"]
    # 'Features': '(null)',
    # 'Gres': '(null)',
    # 'GroupId': 'giovanni(1002)',
    assert test_job_info["job_id"] == int(scontrol_dict["JobId"])
    assert test_job_info["name"] == scontrol_dict["JobName"]
    assert test_job_info["job_state"] == scontrol_dict["JobState"]
    # 'Licenses': '(null)',
    # 'MinCPUsNode': '1',
    # 'MinMemoryNode': '100M',
    # 'MinTmpDiskNode': '0',
    # Missing?
    # 'Network': '(null)',
    assert test_job_info["nice"] == int(scontrol_dict["Nice"])
    # 'NodeList': 'c1',
    # 'NtasksPerN:B:S:C': '0:0:*:*',
    assert test_job_info["num_cpus"] == int(scontrol_dict["NumCPUs"])
    assert test_job_info["num_nodes"] == int(scontrol_dict["NumNodes"])
    assert test_job_info["partition"] == scontrol_dict["Partition"]
    # 'Power': '',
    # 'PreemptTime': 'None',
    assert test_job_info["priority"] == int(scontrol_dict["Priority"])
    # 'QOS': '(null)',
    assert test_job_info["state_reason"] == scontrol_dict["Reason"]
    assert test_job_info["reboot"] == int(scontrol_dict["Reboot"])
    # 'ReqB:S:C:T': '0:0:*:*',
    # 'ReqNodeList': '(null)',
    # This is another bool
    assert test_job_info["requeue"] == int(scontrol_dict["Requeue"])
    # 'Reservation': '(null)',
    assert test_job_info["restart_cnt"] == int(scontrol_dict["Restarts"])
    assert test_job_info["run_time_str"] == scontrol_dict["RunTime"]
    assert test_job_info["sicp_mode"] == int(scontrol_dict["SICP"])
    # 'SecsPreSuspend': '0',
    assert test_job_info["shared"] == scontrol_dict["Shared"]
    # 'Shared': '0',
    # 'Socks/Node': '*',
    # 'StartTime': '2016-03-31T00:25:33',
    assert test_job_info["std_err"] == scontrol_dict["StdErr"]
    assert test_job_info["std_in"] == scontrol_dict["StdIn"]
    assert test_job_info["std_out"] == scontrol_dict["StdOut"]
    # 'SubmitTime': '2016-03-31T00:25:32',
    # 'SuspendTime': 'None',
    # 'TRES': 'cpu',
    # Missing ?
    assert test_job_info["time_limit_str"] == scontrol_dict["TimeLimit"]
    # 'TimeMin': 'N/A',
    # 'UserId': 'giovanni(1002)',
    assert test_job_info["work_dir"] == scontrol_dict["WorkDir"]
def test_job_ids():
    """Job: Test job().ids() return type."""
    returned_ids = pyslurm.job().ids()
    assert_true(isinstance(returned_ids, list))