def submit(self, script_name, immediate=False, depends_on=None,
           depends_on_always_run=False):
    # Submit with -terse so qsub prints only the numeric job id.
    job_id = None
    output = subprocess.run(
        f"qsub -V -terse {script_name}",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    if output.returncode == 0:
        out = output.stdout.decode("utf-8")
        try:
            # The -terse output is a single line containing the job id.
            job_id = int(out.splitlines()[0].strip())
            return job_id
        except (ValueError, IndexError):
            raise SchedulerException("Error submitting job to SGE")
    else:
        raise SchedulerException(
            f"Error submitting job to SGE: {output.stderr.decode('utf-8')}"
        )
def submit(self, script_name, immediate=False, depends_on=None,
           depends_on_always_run=False):
    job_id = None
    if immediate:
        raise NotYetImplementedException("Immediate not yet implemented for LSF")
    # Build the bsub command, adding a dependency condition if requested:
    # ended() fires regardless of outcome, done() only on success.
    if depends_on and depends_on_always_run:
        cmd = 'bsub -w "ended(%s)" < %s' % (depends_on, script_name)
    elif depends_on is not None:
        cmd = 'bsub -w "done(%s)" < %s' % (depends_on, script_name)
    else:
        cmd = "bsub < " + script_name
    with os.popen(cmd) as f:
        # bsub reports e.g. 'Job <1234> is submitted to queue <normal>.'
        output = f.readline()
        try:
            job_id = int(output.split(" ")[1].replace("<", "").replace(">", ""))
        except (ValueError, IndexError):
            raise SchedulerException("Job submission failed: " + output)
    return job_id
def submit(self, script_name, immediate=False, depends_on=None,
           depends_on_always_run=False):
    additional_cmd = os.environ.get("MYCLUSTER_SUBMIT_OPT", "")
    if not immediate:
        if depends_on and depends_on_always_run:
            command = (f"sbatch {additional_cmd} --kill-on-invalid-dep=yes "
                       f"--dependency=afterany:{depends_on} {script_name}")
        elif depends_on is not None:
            command = (f"sbatch {additional_cmd} --kill-on-invalid-dep=yes "
                       f"--dependency=afterok:{depends_on} {script_name}")
        else:
            command = f"sbatch {additional_cmd} {script_name}"
        print(f"running {command}")
        with os.popen(command) as f:
            # sbatch reports e.g. 'Submitted batch job 1234'
            output = f.readline()
            try:
                job_id = int(output.split(" ")[-1].strip())
            except ValueError:
                raise SchedulerException("Job submission failed: " + output)
    else:
        # Immediate mode: recover the SBATCH directives from the script and
        # hand them to salloc so the job runs interactively.
        def _directive(pattern):
            with os.popen('grep -- "%s" %s | sed \'s/#SBATCH//\''
                          % (pattern, script_name)) as f:
                return f.readline().rstrip()

        partition = _directive("SBATCH -p")
        nnodes = _directive("SBATCH --nodes")
        ntasks = _directive("SBATCH --ntasks")
        project = _directive("SBATCH -A")
        job = _directive("SBATCH -J")
        cmd_line = (
            "salloc --exclusive " + nnodes + " " + partition + " "
            + ntasks + " " + project + " " + job + " bash ./" + script_name
        )
        try:
            output = subprocess.check_output(cmd_line, shell=True).decode("utf-8")
            try:
                job_id = int(output.split(" ")[-1].strip())
            except ValueError:
                raise SchedulerException("Job submission failed: " + output)
        except subprocess.CalledProcessError:
            raise SchedulerException("Job submission failed: " + cmd_line)
    return job_id
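# A minimal usage sketch for the submit() implementations above, assuming a
# hypothetical SlurmScheduler class that exposes this method; the class name
# and script names are illustrative, not part of the schedulers themselves.
# It chains three jobs so that each step starts only if the previous one
# succeeded (afterok/done), while the cleanup step always runs (afterany/ended).
if __name__ == "__main__":
    scheduler = SlurmScheduler()  # hypothetical concrete scheduler class

    # Independent first job.
    prepare_id = scheduler.submit("prepare.sh")

    # Runs only if prepare.sh exits successfully.
    compute_id = scheduler.submit("compute.sh", depends_on=prepare_id)

    # Runs whenever compute.sh ends, successfully or not.
    cleanup_id = scheduler.submit(
        "cleanup.sh", depends_on=compute_id, depends_on_always_run=True
    )
    print(f"submitted chain: {prepare_id} -> {compute_id} -> {cleanup_id}")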
def node_config(self, queue_id):
    max_threads = 1
    max_memory = 1
    try:
        tpn = self.tasks_per_node(queue_id)
        vnode_type = self._get_vnode_name(queue_id)
        if vnode_type is not None:
            output = self._check_output(
                "pbsnodes -a -F dsv | grep {}".format(vnode_type), shell=True)
            for line in output.splitlines():
                for item in line.split("|"):
                    [key, value] = item.strip().split("=")
                    if key.strip() == "resources_available.vps_per_ppu":
                        if int(value) > max_threads:
                            max_threads = int(value) * tpn
                    if key.strip() == "resources_available.mem":
                        # Strip the trailing "kb" and convert to MB.
                        mem = float(value[:-2]) / 1024
                        if mem > max_memory:
                            max_memory = mem
    except Exception as e:
        raise SchedulerException("Error fetching node config") from e
    return {"max thread": max_threads, "max memory": max_memory}
def delete(self, job_id):
    cmd = f"bkill {job_id}"
    output = subprocess.run(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
    )
    if output.returncode != 0:
        raise SchedulerException(f"Error cancelling job {job_id}")
def tasks_per_node(self, queue_id):
    # queue_id has the form "<parallel_env>:<queue_name>".
    parallel_env = queue_id.split(":")[0]
    queue_name = queue_id.split(":")[1]
    tasks = 0
    with os.popen("qconf -sq " + queue_name) as f:
        for line in f:
            if line.split(" ")[0] == "slots":
                tasks = int(re.split(r"\W+", line)[1])
    pe_tasks = tasks
    with os.popen("qconf -sp " + parallel_env) as f:
        for line in f:
            if line.split(" ")[0] == "allocation_rule":
                try:
                    # The allocation rule is not always an integer; it can be
                    # a keyword such as $pe_slots, $fill_up or $round_robin,
                    # in which case we keep the queue slot count.
                    pe_tasks = int(re.split(r"\W+", line)[1])
                except ValueError:
                    pass
    return min(tasks, pe_tasks)
def _get_vnode_name(self, queue_id):
    try:
        output = self._check_output(["qstat", "-Qf", queue_id])
        for line in output.splitlines():
            if line.strip().startswith("default_chunk.vntype"):
                return line.split("=")[-1].strip()
    except Exception as e:
        raise SchedulerException("Error fetching node config") from e
    return None
def queues(self):
    queue_list = []
    try:
        output = self._check_output(["qstat", "-Q"])
        # Skip the two header lines; the queue name is the first column.
        lines = output.splitlines()[2:]
        for queue in lines:
            queue_list.append(queue.split(" ")[0])
    except Exception as e:
        raise SchedulerException("Error fetching queues") from e
    return queue_list
def submit(self, script_name, immediate=False, depends_on=None,
           depends_on_always_run=False):
    job_id = None
    if immediate:
        raise NotYetImplementedException("Immediate not yet implemented for PBS")
    # Build the qsub command, adding a dependency condition if requested:
    # afterany fires regardless of outcome, afterok only on success.
    if depends_on and depends_on_always_run:
        cmd = "qsub -W depend=afterany:%s %s" % (depends_on, script_name)
    elif depends_on is not None:
        cmd = "qsub -W depend=afterok:%s %s" % (depends_on, script_name)
    else:
        cmd = "qsub " + script_name
    with os.popen(cmd) as f:
        # qsub prints the full job identifier, e.g. '1234.server-name'.
        output = f.readline()
        try:
            job_id = output.strip().split(".")[0]
        except IndexError:
            raise SchedulerException("Job submission failed: " + output)
    return job_id
def tasks_per_node(self, queue_id):
    tpn = 1
    try:
        vnode_type = self._get_vnode_name(queue_id)
        if vnode_type is not None:
            output = self._check_output(
                "pbsnodes -a -F dsv | grep {}".format(vnode_type), shell=True)
            for line in output.splitlines():
                for item in line.split("|"):
                    [key, value] = item.strip().split("=")
                    if key.strip() == "resources_available.ncpus":
                        if int(value) > tpn:
                            tpn = int(value)
    except Exception as e:
        raise SchedulerException("Error fetching node config") from e
    return tpn
def node_config(self, queue_id):
    # Find the first node in the partition and record its configuration.
    queue_name = queue_id
    config = {}
    with os.popen("sinfo -Nelh -p " + queue_name) as f:
        line = f.readline()
        if len(line):
            new_line = re.sub(" +", " ", line.strip())
            tasks = int(new_line.split(" ")[4])   # CPUS column
            memory = int(new_line.split(" ")[6])  # MEMORY column (MB)
            config["max task"] = tasks
            config["max thread"] = tasks
            config["max memory"] = memory
        else:
            raise SchedulerException(
                "Requested partition %s has no nodes" % queue_name)
    return config
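# An illustrative use of node_config(), assuming a scheduler object exposing
# it as defined above. The memory-per-task arithmetic is an assumption for
# the sketch, not scheduler behaviour: it splits a node's memory (MB) evenly
# across its tasks to help size a job script.
def memory_per_task_mb(scheduler, queue_id):
    config = scheduler.node_config(queue_id)
    return config["max memory"] / max(config["max task"], 1)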
def get_job_details(self, job_id):
    """
    Get full job and step stats for job_id
    """
    stats_dict = {}
    with os.popen(
        "bjobs -o \"jobid run_time cpu_used queue slots stat exit_code"
        " start_time estimated_start_time finish_time delimiter='|'\""
        " -noheader " + str(job_id)
    ) as f:
        try:
            line = f.readline()
            cols = line.split("|")
            stats_dict["job_id"] = cols[0]
            if cols[1] != "-":
                # run_time is reported as e.g. '123 second(s)'.
                stats_dict["wallclock"] = datetime.timedelta(
                    seconds=float(cols[1].split(" ")[0])
                )
            if cols[2] != "-":
                stats_dict["cpu"] = datetime.timedelta(
                    seconds=float(cols[2].split(" ")[0])
                )
            stats_dict["queue"] = cols[3]
            stats_dict["status"] = cols[5]
            stats_dict["exit_code"] = cols[6]
            stats_dict["start"] = cols[7]
            stats_dict["start_time"] = cols[8]
            if stats_dict["status"] in ["DONE", "EXIT"]:
                stats_dict["end"] = cols[9]
        except Exception:
            # bjobs only knows about recent jobs; fall back to bhist.
            with os.popen("bhist -l " + str(job_id)) as g:
                try:
                    stats_dict["status"] = "UNKNOWN"
                    for line in g.readlines():
                        if "Done successfully" in line:
                            stats_dict["status"] = "DONE"
                            return stats_dict
                        elif "Completed <exit>" in line:
                            stats_dict["status"] = "EXIT"
                            return stats_dict
                except Exception as e:
                    raise SchedulerException(
                        "Error fetching job details from bjobs, check job id."
                    ) from e
    return stats_dict
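# A small polling sketch built on the LSF get_job_details() above, assuming a
# scheduler object exposing that method; the helper name and poll interval are
# assumptions for the sketch. It waits for a job to reach a terminal state.
import time

def wait_for_job(scheduler, job_id, poll_seconds=30):
    """Block until the job reaches a terminal LSF state, then return its stats."""
    while True:
        details = scheduler.get_job_details(job_id)
        status = details.get("status", "UNKNOWN")
        # DONE and EXIT are terminal; PEND/RUN mean keep waiting. UNKNOWN
        # means neither bjobs nor bhist could classify the job, so give up.
        if status in ("DONE", "EXIT", "UNKNOWN"):
            return details
        time.sleep(poll_seconds)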
def list_current_jobs(self):
    jobs = []
    output = subprocess.run(
        "squeue -h -u `whoami`",
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    if output.returncode == 0:
        for line in output.stdout.decode("utf-8").splitlines():
            job_info = re.sub(" +", " ", line.strip()).split(" ")
            jobs.append({
                "id": int(job_info[0]),
                "queue": job_info[1],
                "name": job_info[2],
                "state": job_info[4],
            })
    else:
        raise SchedulerException("Error fetching job queue listing")
    return jobs
def available_tasks(self, queue_id):
    free_tasks = 0
    max_tasks = 0
    assigned_tasks = 0
    try:
        vnode_type = self._get_vnode_name(queue_id)
        if vnode_type is not None:
            output = self._check_output(
                "pbsnodes -a -F dsv | grep {}".format(vnode_type), shell=True)
            for line in output.splitlines():
                for item in line.split("|"):
                    [key, value] = item.strip().split("=")
                    if key.strip() == "resources_available.ncpus":
                        max_tasks += int(value)
                    elif key.strip() == "resources_assigned.ncpus":
                        assigned_tasks += int(value)
            free_tasks = max_tasks - assigned_tasks
    except Exception as e:
        raise SchedulerException("Error fetching node config") from e
    return {"available": free_tasks, "max tasks": max_tasks}
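# An illustrative helper over available_tasks(), assuming a PBS scheduler
# object exposing queues() and available_tasks() as defined above; the helper
# name is an assumption for the sketch. It returns the first queue with
# enough free slots for the requested task count.
def pick_queue(scheduler, tasks_needed):
    """Return the first queue with enough free slots, or None."""
    for queue_id in scheduler.queues():
        usage = scheduler.available_tasks(queue_id)
        if usage["available"] >= tasks_needed:
            return queue_id
    return None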
def list_current_jobs(self):
    jobs = []
    output = subprocess.run(
        'bjobs -noheader -u `whoami` -o "jobid queue job_name stat"',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    if output.returncode == 0:
        for line in output.stdout.decode("utf-8").splitlines():
            if line == "No unfinished job found":
                return jobs
            job_info = re.sub(" +", " ", line.strip()).split(" ")
            jobs.append({
                "id": int(job_info[0]),
                "queue": job_info[1],
                "name": job_info[2],
                "state": job_info[3],
            })
    else:
        raise SchedulerException("Error fetching job queue listing")
    return jobs
def get_job_details(self, job_id):
    """
    Get full job and step stats for job_id

    First check using sacct, then fall back to squeue.
    """
    stats_dict = {}
    sacct_cmd = (
        "sacct --noheader --format "
        "JobId,Elapsed,TotalCPU,Partition,NTasks,AveRSS,State,ExitCode,start,end "
        f"-P -j {job_id}"
    )
    squeue_cmd = (
        'squeue --format "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R %S" '
        f"-h -j {job_id}"
    )
    output = subprocess.run(
        sacct_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
    )
    if output.returncode != 0:
        raise SchedulerException("Error fetching job details from sacct")
    lines = output.stdout.decode("utf-8").splitlines()
    if len(lines) != 0:
        if lines[0] not in [
            "SLURM accounting storage is disabled",
            "slurm_load_jobs error: Invalid job id specified",
        ]:
            # First line describes the job as a whole.
            cols = lines[0].split("|")
            stats_dict["job_id"] = cols[0]
            stats_dict["wallclock"] = self._get_timedelta(cols[1])
            stats_dict["cpu"] = self._get_timedelta(cols[2])
            stats_dict["queue"] = cols[3]
            stats_dict["status"] = cols[6]
            stats_dict["exit_code"] = cols[7].split(":")[0]
            stats_dict["start"] = cols[8]
            stats_dict["end"] = cols[9]
            # Remaining lines describe the individual job steps.
            steps = []
            for line in lines[1:]:
                step = {}
                cols = line.split("|")
                step["step"] = cols[0].split(".")[1]
                step["wallclock"] = self._get_timedelta(cols[1])
                step["cpu"] = self._get_timedelta(cols[2])
                step["ntasks"] = cols[4]
                step["status"] = cols[6]
                step["exit_code"] = cols[7].split(":")[0]
                step["start"] = cols[8]
                step["end"] = cols[9]
                steps.append(step)
            stats_dict["steps"] = steps
        else:
            # Accounting is unavailable; fall back to squeue for live jobs.
            output = subprocess.run(
                squeue_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                shell=True
            )
            if output.returncode != 0:
                raise SchedulerException(
                    "Error fetching job details from squeue, check job id.")
            for line in output.stdout.decode("utf-8").splitlines():
                if line == "slurm_load_jobs error: Invalid job id specified":
                    raise SchedulerException("Invalid job id specified")
                new_line = re.sub(" +", " ", line.strip())
                job_id = int(new_line.split(" ")[0])
                state = new_line.split(" ")[4]
                stats_dict["job_id"] = str(job_id)
                stats_dict["status"] = state
                stats_dict["start_time"] = new_line.split(" ")[8]
    return stats_dict
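# A hedged reporting sketch over the SLURM get_job_details() above, assuming
# a scheduler object exposing that method and assuming _get_timedelta returns
# datetime.timedelta values; the helper name is illustrative. It prints
# per-step CPU efficiency as TotalCPU over Elapsed.
def print_step_efficiency(scheduler, job_id):
    details = scheduler.get_job_details(job_id)
    for step in details.get("steps", []):
        wallclock = step["wallclock"].total_seconds()
        cpu = step["cpu"].total_seconds()
        if wallclock > 0:
            print(f"step {step['step']}: {100.0 * cpu / wallclock:.1f}% CPU")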