Example #1
    def submit(self,
               script_name,
               immediate=False,
               depends_on=None,
               depends_on_always_run=False):
        job_id = None

        output = subprocess.run(
            f"qsub -V -terse {script_name}",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
        )
        if output.returncode == 0:
            out = output.stdout.decode("utf-8")
            try:
                # with -terse, qsub prints only the job id on the first line
                job_id = int(out.splitlines()[0].strip())
                return job_id
            except (IndexError, ValueError):
                raise SchedulerException("Error submitting job to SGE")
        else:
            raise SchedulerException(
                f"Error submitting job to SGE: {output.stderr.decode('utf-8')}"
            )
Example #2
 def submit(
     self, script_name, immediate=False, depends_on=None, depends_on_always_run=False
 ):
     job_id = None
     if immediate:
         raise NotYetImplementedException("Immediate not yet implemented for LSF")
     if depends_on and depends_on_always_run:
         cmd = 'bsub -w "ended(%s)" < %s ' % (depends_on, script_name)
         with os.popen(cmd) as f:
             output = f.readline()
              try:
                  # bsub prints e.g. "Job <12345> is submitted to queue <normal>."
                  job_id = int(output.split(" ")[1].replace("<", "").replace(">", ""))
              except (IndexError, ValueError):
                  raise SchedulerException("Job submission failed: " + output)
     elif depends_on is not None:
         cmd = 'bsub -w "done(%s)" < %s ' % (depends_on, script_name)
         with os.popen(cmd) as f:
             output = f.readline()
              try:
                  job_id = int(output.split(" ")[1].replace("<", "").replace(">", ""))
              except (IndexError, ValueError):
                  raise SchedulerException("Job submission failed: " + output)
     else:
         with os.popen("bsub <" + script_name) as f:
             output = f.readline()
              try:
                  job_id = int(output.split(" ")[1].replace("<", "").replace(">", ""))
              except (IndexError, ValueError):
                  raise SchedulerException("Job submission failed: " + output)
     return job_id
Example #3
    def submit(self,
               script_name,
               immediate=False,
               depends_on=None,
               depends_on_always_run=False):
        additional_cmd = ""
        if "MYCLUSTER_SUBMIT_OPT" in os.environ:
            additional_cmd = os.environ["MYCLUSTER_SUBMIT_OPT"]
        if not immediate:
            if depends_on and depends_on_always_run:
                command = f"sbatch {additional_cmd} --kill-on-invalid-dep=yes --dependency=afterany:{depends_on} {script_name}"
            elif depends_on is not None:
                command = f"sbatch {additional_cmd} --kill-on-invalid-dep=yes --dependency=afterok:{depends_on} {script_name}"
            else:
                command = f"sbatch {additional_cmd} {script_name}"
            print(f"running {command}")
            with os.popen(command) as f:
                output = f.readline()
                try:
                    # sbatch prints "Submitted batch job <id>"; take the last token
                    job_id = int(output.split(" ")[-1].strip())
                except (IndexError, ValueError):
                    raise SchedulerException("Job submission failed: " +
                                             output)
        else:
            with os.popen('grep -- "SBATCH -p" ' + script_name +
                          " | sed 's/#SBATCH//'") as f:
                partition = f.readline().rstrip()
            with os.popen('grep -- "SBATCH --nodes" ' + script_name +
                          " | sed 's/#SBATCH//'") as f:
                nnodes = f.readline().rstrip()
            with os.popen('grep -- "SBATCH --ntasks" ' + script_name +
                          " | sed 's/#SBATCH//'") as f:
                ntasks = f.readline().rstrip()
            with os.popen('grep -- "SBATCH -A" ' + script_name +
                          " | sed 's/#SBATCH//'") as f:
                project = f.readline().rstrip()
            with os.popen('grep -- "SBATCH -J" ' + script_name +
                          " | sed 's/#SBATCH//'") as f:
                job = f.readline().rstrip()

            cmd_line = ("salloc --exclusive " + nnodes + " " + partition +
                        " " + ntasks + " " + project + " " + job + " bash ./" +
                        script_name)
            try:
                output = subprocess.check_output(
                    cmd_line, shell=True).decode("utf-8")
                try:
                    job_id = int(output.split(" ")[-1].strip())
                except (IndexError, ValueError):
                    raise SchedulerException("Job submission failed: " +
                                             output)
            except subprocess.CalledProcessError:
                raise SchedulerException("Job submission failed: " + cmd_line)
        return job_id
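
Note on the example above: the depends_on and depends_on_always_run arguments are translated into sbatch's --dependency=afterok / --dependency=afterany flags, so jobs can be chained. A minimal usage sketch, assuming an instance of this Slurm scheduler class named scheduler and two illustrative script names (neither is part of the original code):

    # Submit a job, then a follow-up that only starts if the first one
    # completed successfully (afterok); pass depends_on_always_run=True
    # to start it regardless of the first job's outcome (afterany).
    first_id = scheduler.submit("prepare.job")
    second_id = scheduler.submit("analyse.job", depends_on=first_id)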
Example #4
 def node_config(self, queue_id):
     max_threads = 1
     max_memory = 1
     try:
         tpn = self.tasks_per_node(queue_id)
         vnode_type = self._get_vnode_name(queue_id)
         if vnode_type is not None:
             output = self._check_output(
                 "pbsnodes -a -F dsv | grep {}".format(vnode_type),
                 shell=True)
             for line in output.splitlines():
                 for item in line.split("|"):
                     [key, value] = item.strip().split("=")
                     if key.strip() == "resources_available.vps_per_ppu":
                         if int(value) > max_threads:
                             max_threads = int(value) * tpn
                     if key.strip() == "resources_available.mem":
                         # strip the "kb" suffix and convert to MB
                         mem = float(value[:-2]) / 1024
                         if mem > max_memory:
                             max_memory = mem
     except Exception as e:
         raise SchedulerException("Error fetching node config")
     return {"max thread": max_threads, "max memory": max_memory}
Example #5
 def delete(self, job_id):
     cmd = f"bkill {job_id}"
     output = subprocess.run(
         cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
     )
     if output.returncode != 0:
         raise SchedulerException(f"Error cancelling job {job_id}")
Example #6
    def tasks_per_node(self, queue_id):
        parallel_env = queue_id.split(":")[0]
        queue_name = queue_id.split(":")[1]
        tasks = 0
        with os.popen("qconf -sq " + queue_name) as f:
            for line in f:
                if line.split(" ")[0] == "slots":
                    tasks = int(re.split(r"\W+", line)[1])

        pe_tasks = tasks
        with os.popen("qconf -sp " + parallel_env) as f:
            for line in f:
                if line.split(" ")[0] == "allocation_rule":
                    try:
                        # The allocation rule is not always an integer
                        # (e.g. "$pe_slots" or "$fill_up"), so fall back
                        # to the queue slot count in that case.
                        pe_tasks = int(re.split(r"\W+", line)[1])
                    except ValueError:
                        pass

        return min(tasks, pe_tasks)
Example #7
 def _get_vnode_name(self, queue_id):
     try:
         output = self._check_output(["qstat", "-Qf", queue_id])
         for line in output.splitlines():
             if line.strip().startswith("default_chunk.vntype"):
                 return line.split("=")[-1].strip()
     except Exception as e:
         raise SchedulerException("Error fetching node config")
Example #8
 def queues(self):
     queue_list = []
     try:
         output = self._check_output(["qstat", "-Q"])
         lines = output.splitlines()[2:]
         for queue in lines:
             queue_list.append(queue.split(" ")[0])
     except Exception as e:
         raise SchedulerException("Error fetching queues")
     return queue_list
Example #9
 def submit(self,
            script_name,
            immediate=False,
            depends_on=None,
            depends_on_always_run=False):
     job_id = None
     if not immediate:
         if depends_on and depends_on_always_run:
             with os.popen("qsub -W depend=afterany:%s %s" %
                           (depends_on, script_name)) as f:
                 output = f.readline()
                 try:
                     job_id = output.strip().split(".")[0]
                  except Exception:
                     raise SchedulerException("Job submission failed: " +
                                              output)
         elif depends_on is not None:
             with os.popen("qsub -W depend=afterok:%s %s" %
                           (depends_on, script_name)) as f:
                 output = f.readline()
                 try:
                     job_id = output.strip().split(".")[0]
                  except Exception:
                     raise SchedulerException("Job submission failed: " +
                                              output)
         else:
             with os.popen("qsub " + script_name) as f:
                 output = f.readline()
                 try:
                     job_id = output.strip().split(".")[0]
                  except Exception:
                     raise SchedulerException("Job submission failed: " +
                                              output)
     else:
         raise NotYetImplementedException(
             "Immediate not yet implemented for PBS")
     return job_id
Example #10
 def tasks_per_node(self, queue_id):
     tpn = 1
     try:
         vnode_type = self._get_vnode_name(queue_id)
         if vnode_type is not None:
             output = self._check_output(
                 "pbsnodes -a -F dsv | grep {}".format(vnode_type),
                 shell=True)
             for line in output.splitlines():
                 for item in line.split("|"):
                     [key, value] = item.strip().split("=")
                     if key.strip() == "resources_available.ncpus":
                         if int(value) > tpn:
                             tpn = int(value)
     except Exception as e:
         raise SchedulerException("Error fetching node config")
     return tpn
Example #11
 def node_config(self, queue_id):
     # Find first node with queue and record node config
     queue_name = queue_id
     tasks = 0
     config = {}
     with os.popen("sinfo -Nelh -p " + queue_name) as f:
         line = f.readline()
         if len(line):
             new_line = re.sub(" +", " ", line.strip())
             tasks = int(new_line.split(" ")[4])
             memory = int(new_line.split(" ")[6])
             config["max task"] = tasks
             config["max thread"] = tasks
             config["max memory"] = memory
         else:
             raise SchedulerException(
                 "Requested partition %s has no nodes" % queue_name)
     return config
Example #12
 def get_job_details(self, job_id):
     """
     Get full job and step stats for job_id
     """
     stats_dict = {}
     with os.popen(
         "bjobs -o \"jobid run_time cpu_used  queue slots  stat exit_code start_time estimated_start_time finish_time delimiter='|'\" -noheader "
         + str(job_id)
     ) as f:
         try:
             line = f.readline()
             cols = line.split("|")
             stats_dict["job_id"] = cols[0]
             if cols[1] != "-":
                 stats_dict["wallclock"] = datetime.timedelta(
                     seconds=float(cols[1].split(" ")[0])
                 )
             if cols[2] != "-":
                 stats_dict["cpu"] = datetime.timedelta(seconds=float(cols[2].split(" ")[0]))
             stats_dict["queue"] = cols[3]
             stats_dict["status"] = cols[5]
             stats_dict["exit_code"] = cols[6]
             stats_dict["start"] = cols[7]
             stats_dict["start_time"] = cols[8]
             if stats_dict["status"] in ["DONE", "EXIT"]:
                 stats_dict["end"] = cols[9]
          except (IndexError, ValueError):
             with os.popen("bhist -l " + str(job_id)) as f:
                 try:
                     output = f.readlines()
                     for line in output:
                         if "Done successfully" in line:
                             stats_dict["status"] = "DONE"
                             return stats_dict
                         elif "Completed <exit>" in line:
                             stats_dict["status"] = "EXIT"
                             return stats_dict
                         else:
                             stats_dict["status"] = "UNKNOWN"
                 except Exception as e:
                     raise SchedulerException(
                         "Error fetching job details from bjobs, check job id."
                     )
     return stats_dict
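
The bjobs call above uses delimiter='|', so each field maps onto a fixed index of the split line. A small runnable sketch of that mapping on a made-up output line (all values are invented for illustration, not real bjobs output):

    # Hypothetical bjobs -o ... delimiter='|' output line for one job.
    line = "123456|60 second(s)|0.0 second(s)|normal|1|DONE|0|Jan 01 10:00|-|Jan 01 10:05"
    cols = line.split("|")
    print(cols[0], cols[5], cols[9])   # job id, status, finish time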
Example #13
 def list_current_jobs(self):
     jobs = []
     output = subprocess.run(
         "squeue -h -u `whoami`",
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         shell=True,
     )
     if output.returncode == 0:
         for line in output.stdout.decode("utf-8").splitlines():
             job_info = re.sub(" +", " ", line.strip()).split(" ")
             jobs.append({
                 "id": int(job_info[0]),
                 "queue": job_info[1],
                 "name": job_info[2],
                 "state": job_info[4],
             })
     else:
         raise SchedulerException("Error fetching job queue listing")
     return jobs
Example #14
 def available_tasks(self, queue_id):
     free_tasks = 0
     max_tasks = 0
     assigned_tasks = 0
     try:
         vnode_type = self._get_vnode_name(queue_id)
         if vnode_type is not None:
             output = self._check_output(
                 "pbsnodes -a -F dsv | grep {}".format(vnode_type),
                 shell=True)
             for line in output.splitlines():
                 for item in line.split("|"):
                     [key, value] = item.strip().split("=")
                     if key.strip() == "resources_available.ncpus":
                         max_tasks += int(value)
                     elif key.strip() == "resources_assigned.ncpus":
                         assigned_tasks += int(value)
         free_tasks = max_tasks - assigned_tasks
     except Exception as e:
         raise SchedulerException("Error fetching node config")
     return {"available": free_tasks, "max tasks": max_tasks}
Example #15
 def list_current_jobs(self):
     jobs = []
     output = subprocess.run(
         'bjobs -noheader -u `whoami` -o "jobid queue job_name stat"',
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         shell=True,
     )
     if output.returncode == 0:
         for line in output.stdout.decode("utf-8").splitlines():
             if line == "No unfinished job found":
                 return jobs
             job_info = re.sub(" +", " ", line.strip()).split(" ")
             jobs.append(
                 {
                     "id": int(job_info[0]),
                     "queue": job_info[1],
                     "name": job_info[2],
                     "state": job_info[3],
                 }
             )
     else:
         raise SchedulerException("Error fetching job queue listing")
     return jobs
Example #16
 def get_job_details(self, job_id):
     """
     Get full job and step stats for job_id
     First check using sacct, then fallback to squeue
     """
     stats_dict = {}
     sacct_cmd = f"sacct --noheader --format JobId,Elapsed,TotalCPU,Partition,NTasks,AveRSS,State,ExitCode,start,end -P -j {job_id}"
     squeue_cmd = f'squeue --format "%.18i %.9P %.8j %.8u %.2t %.10M %.6D %R %S" -h -j {job_id}'
     output = subprocess.run(sacct_cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True)
     if output.returncode != 0:
         raise SchedulerException("Error fetching job details from sacct")
     lines = output.stdout.decode("utf-8").splitlines()
     if len(lines) != 0:
         if lines[0] not in [
                 "SLURM accounting storage is disabled",
                 "slurm_load_jobs error: Invalid job id specified",
         ]:
             cols = lines[0].split("|")
             stats_dict["job_id"] = cols[0]
             stats_dict["wallclock"] = self._get_timedelta(cols[1])
             stats_dict["cpu"] = self._get_timedelta(cols[2])
             stats_dict["queue"] = cols[3]
             stats_dict["status"] = cols[6]
             stats_dict["exit_code"] = cols[7].split(":")[0]
             stats_dict["start"] = cols[8]
             stats_dict["end"] = cols[9]
             steps = []
             for line in lines[1:]:
                 step = {}
                 cols = line.split("|")
                 step_val = cols[0].split(".")[1]
                 step["step"] = step_val
                 step["wallclock"] = self._get_timedelta(cols[1])
                 step["cpu"] = self._get_timedelta(cols[2])
                 step["ntasks"] = cols[4]
                 step["status"] = cols[6]
                 step["exit_code"] = cols[7].split(":")[0]
                 step["start"] = cols[8]
                 step["end"] = cols[9]
                 steps.append(step)
             stats_dict["steps"] = steps
     else:
         output = subprocess.run(squeue_cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=True)
         if output.returncode != 0:
             raise SchedulerException(
                 "Error fetching job details from squeue, check job id.")
         lines = output.stdout.decode("utf-8").splitlines()
         for line in lines:
             if line == "slurm_load_jobs error: Invalid job id specified":
                 raise SchedulerException("Invalid job id specified")
             new_line = re.sub(" +", " ", line.strip())
             job_id = int(new_line.split(" ")[0])
             state = new_line.split(" ")[4]
             stats_dict["job_id"] = str(job_id)
             stats_dict["status"] = state
             stats_dict["start_time"] = new_line.split(" ")[8]
     return stats_dict
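
The _get_timedelta helper used above is not shown in this listing. A minimal sketch of what such a helper might look like, written here as a standalone function and assuming sacct's usual [DD-]HH:MM:SS elapsed format (optionally with fractional seconds); this is an assumption about the format, not the project's actual implementation:

    import datetime

    def _get_timedelta(value):
        # Parse sacct time strings such as "1-02:03:04", "02:03:04" or "05:04.123".
        days = 0
        if "-" in value:
            day_part, value = value.split("-", 1)
            days = int(day_part)
        parts = [float(p) for p in value.split(":")]
        while len(parts) < 3:
            parts.insert(0, 0.0)   # pad missing hour/minute fields
        hours, minutes, seconds = parts
        return datetime.timedelta(days=days, hours=hours,
                                  minutes=minutes, seconds=seconds)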