コード例 #1
0
 def obtainSystemConstants(cls):
     """Return (maxCPU, maxMEM): the largest NCPU count and MEMTOT value
     that ``qhost`` reports across all execution hosts.

     Raises RuntimeError if the expected columns are missing, the output
     has inconsistent column counts, or no host reported usable values.
     """
     # expect qhost output is in the form:
     # HOSTNAME                ARCH         NCPU NSOC NCOR NTHR NLOAD  MEMTOT  MEMUSE  SWAPTO  SWAPUS
     # ----------------------------------------------------------------------------------------------
     # global                  -               -    -    -    -     -       -       -       -       -
     # compute-1-1             lx-amd64       72    2   36   72  0.49  188.8G   79.6G   92.7G   19.2G
     # compute-1-10            lx-amd64       72    2   36   72  0.22  188.8G   51.1G   92.7G    2.8G
     lines = call_command(["qhost"]).strip().split('\n')
     items = lines[0].strip().split()
     num_columns = len(items)
     cpu_index = None
     mem_index = None
     for i in range(num_columns):
         if items[i] == 'NCPU':
             cpu_index = i
         elif items[i] == 'MEMTOT':
             mem_index = i
     if cpu_index is None or mem_index is None:
         raise RuntimeError('qhost command does not return NCPU or MEMTOT columns')
     maxCPU = 0
     maxMEM = MemoryString("0")
     # Skip the header and the separator row; '-' marks hosts with no data
     # (e.g. the synthetic "global" row).
     for line in lines[2:]:
         items = line.strip().split()
         if len(items) < num_columns:
             raise RuntimeError('qhost output has a varying number of columns')
         if items[cpu_index] != '-' and int(items[cpu_index]) > maxCPU:
             maxCPU = int(items[cpu_index])
         if items[mem_index] != '-' and MemoryString(items[mem_index]) > maxMEM:
             maxMEM = MemoryString(items[mem_index])
     # BUG FIX: the original used identity checks (`maxCPU is 0 or maxMEM is 0`),
     # which is implementation-defined for ints and always False for a
     # MemoryString instance; use equality instead.
     if maxCPU == 0 or maxMEM == MemoryString("0"):
         raise RuntimeError('qhost returned null NCPU or MEMTOT info')
     return maxCPU, maxMEM
コード例 #2
0
ファイル: lsf.py プロジェクト: tmooney/toil
        def getJobExitCode(self, lsfJobID):
            """Look up the exit code of an LSF job.

            Returns 1 for jobs flagged as never submitted, otherwise asks
            bjobs (JSON mode when supported) and falls back to the slower
            text-parsing path.
            """
            # Jobs that failed submission were given a NOT_SUBMITTED_* id.
            if "NOT_SUBMITTED" in lsfJobID:
                logger.error("bjobs detected job failed to submit")
                return 1

            # getBatchSystemID() may have appended a ".task" component.
            job = lsfJobID
            task = None
            if '.' in lsfJobID:
                job, task = lsfJobID.split('.', 1)

            self.parseMaxMem(job)
            # first try bjobs to find out job state
            # NOTE(review): check_lsf_json_output_supported is used as a
            # truthy flag here -- presumably a module-level boolean; confirm.
            if check_lsf_json_output_supported:
                logger.debug("Checking job exit code for job via bjobs: "
                             "{}".format(job))
                stdout = call_command([
                    "bjobs", "-json", "-o",
                    "user exit_code stat exit_reason pend_reason",
                    str(job)
                ])
                records = self.parseBjobs(stdout)
                if records:
                    return self.parse_bjobs_record(records[0], job)

            return self.fallbackGetJobExitCode(job)
コード例 #3
0
ファイル: torque.py プロジェクト: mr-c/toil
        def getJobExitCode(self, torqueJobID):
            """Query qstat for the exit status of a Torque/PBSPro job.

            Returns the integer exit status, 1 for a detected failure,
            0 when the batch system has already forgotten the job, or
            None when no status is available yet.
            """
            if self._version == "pro":
                args = ["qstat", "-x", "-f", str(torqueJobID).split('.')[0]]
            elif self._version == "oss":
                args = ["qstat", "-f", str(torqueJobID).split('.')[0]]
            else:
                # BUG FIX: previously `args` was silently left unbound here,
                # producing a confusing NameError below.
                raise RuntimeError("Unknown PBS/Torque version: {}".format(self._version))

            stdout = call_command(args)
            for line in stdout.split('\n'):
                line = line.strip()
                # Case differences due to PBSPro vs OSS Torque qstat outputs.
                # BUG FIX: the original expression was
                #   startswith("failed") or startswith("FAILED") and int(...) == 1
                # where `and` binds tighter than `or`, so any line starting
                # with "failed" returned 1 without checking the value field.
                if ((line.startswith("failed") or line.startswith("FAILED"))
                        and int(line.split()[1]) == 1):
                    return 1
                if line.startswith("exit_status") or line.startswith(
                        "Exit_status"):
                    status = line.split(' = ')[1]
                    logger.debug('Exit Status: ' + status)
                    return int(status)
                if 'unknown job id' in line.lower():
                    # some clusters configure Torque to forget everything about just
                    # finished jobs instantly, apparently for performance reasons
                    logger.debug(
                        'Batch system no longer remembers about job {}'.format(
                            torqueJobID))
                    # return assumed success; status files should reveal failure
                    return 0
            return None
コード例 #4
0
        def _getJobDetailsFromSacct(self, slurmJobID):
            """Return (state, exit status) for one SLURM job via ``sacct``.

            Returns (None, None) when sacct produced no parseable line.
            A non-zero killing signal is folded into the conventional
            128+signal shell status.
            """
            # SLURM job exit codes are obtained by running sacct.
            args = ['sacct',
                    '-n', # no header
                    '-j', str(slurmJobID), # job
                    '--format', 'State,ExitCode', # specify output columns
                    '-P', # separate columns with pipes
                    '-S', '1970-01-01'] # override start time limit

            stdout = call_command(args)
            for line in stdout.split('\n'):
                logger.debug("%s output %s", args[0], line)
                values = line.strip().split('|')
                if len(values) < 2:
                    continue
                # BUG FIX: take only the first two fields; the original
                # strict unpack raised ValueError whenever sacct emitted an
                # unexpected extra column (the guard above only rejects < 2).
                state, exitcode = values[:2]
                logger.debug("sacct job state is %s", state)
                # ExitCode is "status:signal".
                status, signal = [int(n) for n in exitcode.split(':')]
                if signal > 0:
                    # A non-zero signal may indicate e.g. an out-of-memory killed job
                    status = 128 + signal
                logger.debug("sacct exit code is %s, returning status %d", exitcode, status)
                return state, status
            logger.debug("Did not find exit code for job in sacct output")
            return None, None
コード例 #5
0
ファイル: lsf.py プロジェクト: rahul-yadav-supra/toil
 def submitJob(self, subLine):
     """Run the bsub command line *subLine* and return the numeric job id.

     The job id is parsed from the first output line, e.g.
     "Job <39605914> is submitted to default queue <general>.".
     """
     # BUG FIX: copy before updating -- the original called .update() on
     # self.boss.environment directly, permanently polluting the shared
     # environment dict with the contents of os.environ.
     combinedEnv = dict(self.boss.environment)
     combinedEnv.update(os.environ)
     stdout = call_command(subLine, env=combinedEnv)
     line = stdout.split('\n')[0]
     result = int(line.strip().split()[1].strip('<>'))
     logger.debug("Got the job id: {}".format(result))
     return result
コード例 #6
0
ファイル: torque.py プロジェクト: mpcusack-color/toil
        def getRunningJobIDs(self):
            """Return a dict mapping our internal job IDs to walltime values
            for jobs that qstat reports as running ('R').

            NOTE(review): the stored walltime is time.mktime() applied to a
            partially-filled strptime struct, not a plain seconds count --
            confirm downstream consumers expect that encoding.
            """
            times = {}
            with self.runningJobsLock:
                # Map the Torque job-id string back to our internal job id.
                currentjobs = dict((str(self.batchJobIDs[x][0].strip()), x)
                                   for x in self.runningJobs)
            logger.debug("getRunningJobIDs current jobs are: " +
                         str(currentjobs))
            # Skip running qstat if we don't have any current jobs
            if not currentjobs:
                return times
            # Only query for job IDs to avoid clogging the batch system on heavily loaded clusters
            # PBS plain qstat will return every running job on the system.
            jobids = sorted(list(currentjobs.keys()))
            # NOTE(review): if self._version is neither "pro" nor "oss",
            # `stdout` stays unbound and the loop below raises NameError;
            # presumably _pbsVersion() guarantees one of the two -- verify.
            if self._version == "pro":
                stdout = call_command(['qstat', '-x'] + jobids)
            elif self._version == "oss":
                stdout = call_command(['qstat'] + jobids)

            # qstat supports XML output which is more comprehensive, but PBSPro does not support it
            # so instead we stick with plain commandline qstat tabular outputs
            for currline in stdout.split('\n'):
                items = currline.strip().split()
                if items:
                    jobid = items[0].strip()
                    if jobid in currentjobs:
                        logger.debug("getRunningJobIDs job status for is: " +
                                     items[4])
                    if jobid in currentjobs and items[4] == 'R':
                        walltime = items[3]
                        logger.debug(
                            "getRunningJobIDs qstat reported walltime is: " +
                            walltime)
                        # normal qstat has a quirk with job time where it reports '0'
                        # when initially running; this catches this case
                        if walltime == '0':
                            walltime = time.mktime(
                                time.strptime(walltime, "%S"))
                        else:
                            walltime = time.mktime(
                                time.strptime(walltime, "%H:%M:%S"))
                        times[currentjobs[jobid]] = walltime

            logger.debug("Job times from qstat are: " + str(times))
            return times
コード例 #7
0
ファイル: slurm.py プロジェクト: rahul-yadav-supra/toil
 def submitJob(self, subLine):
     """Run the sbatch command line *subLine* and return the SLURM job id.

     Raises OSError if the sbatch invocation fails.
     """
     try:
         output = call_command(subLine)
         # sbatch prints a line like 'Submitted batch job 2954103'
         result = int(output.strip().split()[-1])
         logger.debug("sbatch submitted job %d", result)
         return result
     except OSError:
         logger.error("sbatch command failed")
         # IDIOM FIX: bare `raise` re-raises the active exception with its
         # original traceback intact ('raise e' was redundant).
         raise
コード例 #8
0
 def fallbackRunningJobIDs(self, currentjobs):
     """Return {our job ID: elapsed wall time} for jobs in *currentjobs*
     that bjobs (plain-text mode) reports in state RUN."""
     times = {}
     stdout = call_command(["bjobs", "-o", "jobid stat start_time delimiter='|'"])
     for row in stdout.split('\n'):
         fields = row.strip().split('|')
         if fields[0] in currentjobs and fields[1] == 'RUN':
             # An unparseable start time defaults to "now" (zero elapsed).
             started = parse(fields[2], default=datetime.now(tzlocal()))
             times[currentjobs[fields[0]]] = datetime.now(tzlocal()) - started
     return times
コード例 #9
0
ファイル: slurm.py プロジェクト: tmooney/toil
        def _getJobDetailsFromSacct(self, job_id_list: list) -> dict:
            """
            Get SLURM job exit codes for the jobs in `job_id_list` by running `sacct`.
            :param job_id_list: list of integer batch job IDs.
            :return: dict of job statuses, where key is the job-id, and value is a tuple
            containing the job's state and exit code.
            """
            # IDIOM FIX: don't shadow the builtin `id` as the loop variable.
            job_ids = ",".join(str(job_id) for job_id in job_id_list)
            args = [
                'sacct',
                '-n',  # no header
                '-j',
                job_ids,  # job
                '--format',
                'JobIDRaw,State,ExitCode',  # specify output columns
                '-P',  # separate columns with pipes
                '-S',
                '1970-01-01'
            ]  # override start time limit
            stdout = call_command(args)

            # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
            # job state and exit status. Initialize dict before processing output of `sacct`.
            job_statuses = {}
            for job_id in job_id_list:
                job_statuses[job_id] = (None, None)

            for line in stdout.splitlines():
                values = line.strip().split('|')
                if len(values) < 3:
                    continue
                # BUG FIX: take only the first three fields; the original
                # strict unpack raised ValueError on an unexpected extra
                # column (the guard above only rejects < 3).
                job_id_raw, state, exitcode = values[:3]
                logger.debug("%s state of job %s is %s", args[0], job_id_raw,
                             state)
                # JobIDRaw is in the form JobID[.JobStep]; we're not interested in job steps.
                job_id_parts = job_id_raw.split(".")
                if len(job_id_parts) > 1:
                    continue
                job_id = int(job_id_parts[0])
                # ExitCode is "status:signal"; fold a killing signal into the
                # conventional 128+signal shell status.
                status, signal = [int(n) for n in exitcode.split(':')]
                if signal > 0:
                    # A non-zero signal may indicate e.g. an out-of-memory killed job
                    status = 128 + signal
                logger.debug("%s exit code of job %d is %s, return status %d",
                             args[0], job_id, exitcode, status)
                job_statuses[job_id] = state, status
            logger.debug("%s returning job statuses: %s", args[0],
                         job_statuses)
            return job_statuses
コード例 #10
0
        def getRunningJobIDs(self):
            """Return {our job ID: seconds running} for jobs qstat lists in
            state 'r'."""
            with self.runningJobsLock:
                # Map the grid-engine job-id string back to our internal id.
                currentjobs = {str(self.batchJobIDs[job][0]): job
                               for job in self.runningJobs}
            stdout = call_command(["qstat"])

            times = {}
            for row in stdout.split('\n'):
                fields = row.strip().split()
                if not fields:
                    continue
                if fields[0] in currentjobs and fields[4] == 'r':
                    # Columns 5 and 6 hold the submit/start date and time.
                    started = time.mktime(time.strptime(" ".join(fields[5:7]),
                                                        "%m/%d/%Y %H:%M:%S"))
                    times[currentjobs[fields[0]]] = time.time() - started
            return times
コード例 #11
0
ファイル: slurm.py プロジェクト: tmooney/toil
 def getWaitDuration(cls):
     """Return a wait duration (seconds) derived from the SLURM scheduler
     configuration: 1.2x the larger of SchedulerTimeSlice and
     AcctGatherNodeFreq, rounded up.

     Raises RuntimeError if neither key appears in the scontrol output.
     """
     # Extract the slurm batchsystem config for the appropriate value
     lines = call_command(['scontrol', 'show', 'config']).split('\n')
     time_value_list = []
     for line in lines:
         values = line.split()
         if len(values) > 0 and (values[0] == "SchedulerTimeSlice"
                                 or values[0] == "AcctGatherNodeFreq"):
             # Tokens after '=' are e.g. ['30', 'sec'] -> value then unit.
             time_name = values[values.index('=') + 1:][1]
             time_value = int(values[values.index('=') + 1:][0])
             if time_name == 'min':
                 time_value *= 60
             # Add a 20% ceiling on the wait duration relative to the scheduler update duration
             time_value_list.append(math.ceil(time_value * 1.2))
     # BUG FIX: max() on an empty list raised a bare ValueError when neither
     # config key was present; fail with an explanatory error instead.
     if not time_value_list:
         raise RuntimeError('Could not find SchedulerTimeSlice or '
                            'AcctGatherNodeFreq in scontrol config output')
     return max(time_value_list)
コード例 #12
0
ファイル: lsf.py プロジェクト: rahul-yadav-supra/toil
        def getRunningJobIDs(self):
            """Return {our job ID: elapsed wall time} for jobs bjobs reports
            in state RUN."""
            with self.runningJobsLock:
                # Map the LSF job-id string back to our internal job id.
                currentjobs = {str(self.batchJobIDs[job][0]): job
                               for job in self.runningJobs}

            stdout = call_command(
                ["bjobs", "-o", "jobid stat start_time delimiter='|'"])
            times = {}
            for row in stdout.split('\n'):
                fields = row.strip().split('|')
                if fields[0] in currentjobs and fields[1] == 'RUN':
                    # An unparseable start time defaults to "now".
                    started = parse(fields[2], default=datetime.now(tzlocal()))
                    times[currentjobs[fields[0]]] = datetime.now(tzlocal()) - started
            return times
コード例 #13
0
ファイル: torque.py プロジェクト: carissafletcher/toil
        def _pbsVersion(self):
            """Determine the PBS/Torque flavor via ``pbsnodes --version``.

            Sets and returns self._version: "pro" for proprietary PBSPro,
            "oss" for open-source Torque.

            NOTE(review): if pbsnodes fails with a non-zero return code the
            error is only logged and self._version may be unset, so the
            return raises AttributeError -- confirm callers rely on this.
            """
            # STYLE FIX: the original body mixed 5/6/7-space indentation;
            # normalized to the file's 4-space convention.
            try:
                out = call_command(["pbsnodes", "--version"])
                if "PBSPro" in out:
                    logger.debug("PBS Pro proprietary Torque version detected")
                    self._version = "pro"
                else:
                    logger.debug("Torque OSS version detected")
                    self._version = "oss"
            except CalledProcessErrorStderr as e:
                if e.returncode != 0:
                    logger.error("Could not determine PBS/Torque version")

            return self._version
コード例 #14
0
        def submitJob(self, subLine):
            """Submit an LSF job via *subLine*.

            Returns the integer job id on success, or a "NOT_SUBMITTED_<n>"
            marker string on failure so getJobExitCode can flag the job.
            """
            # BUG FIX: copy before updating -- the original called .update()
            # on self.boss.environment directly, permanently polluting the
            # shared environment dict with the contents of os.environ.
            combinedEnv = dict(self.boss.environment)
            combinedEnv.update(os.environ)
            stdout = call_command(subLine, env=combinedEnv)
            # Example success: Job <39605914> is submitted to default queue <general>.
            # Example fail: Service class does not exist. Job not submitted.
            result_search = re.search('Job <(.*)> is submitted', stdout)

            if result_search:
                result = int(result_search.group(1))
                logger.debug("Got the job id: {}".format(result))
            else:
                logger.error("Could not submit job\nReason: {}".format(stdout))
                temp_id = randint(10000000, 99999999)
                #Flag this job to be handled by getJobExitCode
                result = "NOT_SUBMITTED_{}".format(temp_id)
            return result
コード例 #15
0
        def _getJobDetailsFromScontrol(self, slurmJobID):
            """Return (state, exit status) for *slurmJobID* using
            ``scontrol show job``.

            Returns (None, None) when SLURM no longer knows the job.
            """
            args = ['scontrol',
                    'show',
                    'job',
                    str(slurmJobID)]

            stdout = call_command(args)
            if isinstance(stdout, str):
                values = stdout.strip().split()
            elif isinstance(stdout, bytes):
                values = stdout.decode('utf-8').strip().split()

            # If job information is not available an error is issued:
            # slurm_load_jobs error: Invalid job id specified
            # There is no job information, so exit.
            if len(values) > 0 and values[0] == 'slurm_load_jobs':
                return (None, None)

            # Output is many key=value tokens. BUG FIX: the original nested a
            # second loop over the same token list inside the first (an
            # accidental O(n^2) that rebuilt the dict per token), split on
            # every '=' so values containing '=' were truncated, and raised
            # IndexError on tokens with no '=' at all. Tokens without '='
            # are whitespace that belonged to the previous value.
            job = dict()
            key = None
            for item in values:
                logger.debug(f"{args[0]} output {item}")
                bits = item.split('=', 1)
                if len(bits) == 2:
                    key = bits[0]
                    job[key] = bits[1]
                elif key is not None:
                    job[key] += ' ' + bits[0]

            state = job['JobState']
            try:
                exitcode = job['ExitCode']
                if exitcode is not None:
                    # ExitCode is "status:signal"; fold a killing signal into
                    # the conventional 128+signal shell status.
                    status, signal = [int(n) for n in exitcode.split(':')]
                    if signal > 0:
                        # A non-zero signal may indicate e.g. an out-of-memory killed job
                        status = 128 + signal
                    logger.debug("scontrol exit code is %s, returning status %d", exitcode, status)
                    rc = status
                else:
                    rc = None
            except KeyError:
                rc = None

            return state, rc
コード例 #16
0
ファイル: slurm.py プロジェクト: mr-c/toil
        def _getJobDetailsFromScontrol(self, slurmJobID):
            """Return (state, exit status) for *slurmJobID* using
            ``scontrol show job``.

            The exit status folds a non-zero killing signal into the
            conventional 128+signal value; it is None when scontrol reported
            no ExitCode.

            NOTE(review): if call_command returned something that is neither
            str nor bytes, `lines` would be unbound (NameError); and the
            first token of the output is assumed to contain '=' so that
            `key` is bound before any continuation token -- verify.
            """
            args = ['scontrol', 'show', 'job', str(slurmJobID)]

            stdout = call_command(args)
            if isinstance(stdout, str):
                lines = stdout.splitlines()
            elif isinstance(stdout, bytes):
                lines = stdout.decode('utf-8').splitlines()

            job = dict()
            for line in lines:
                for item in line.split():
                    logger.debug(f"{args[0]} output {item}")

                    # Output is in the form of many key=value pairs, multiple pairs on each line
                    # and multiple lines in the output. Each pair is pulled out of each line and
                    # added to a dictionary.
                    # Note: In some cases, the value itself may contain white-space. So, if we find
                    # a key without a value, we consider that key part of the previous value.
                    bits = item.split('=', 1)
                    if len(bits) == 1:
                        # Continuation token: append to the most recent value.
                        job[key] += ' ' + bits[0]
                    else:
                        key = bits[0]
                        job[key] = bits[1]

            state = job['JobState']
            try:
                exitcode = job['ExitCode']
                if exitcode is not None:
                    # ExitCode is "status:signal".
                    status, signal = [int(n) for n in exitcode.split(':')]
                    if signal > 0:
                        # A non-zero signal may indicate e.g. an out-of-memory killed job
                        status = 128 + signal
                    logger.debug(
                        "scontrol exit code is %s, returning status %d",
                        exitcode, status)
                    rc = status
                else:
                    rc = None
            except KeyError:
                rc = None

            return state, rc
コード例 #17
0
        def getRunningJobIDs(self):
            """Return {our job ID: elapsed wall time} for running LSF jobs,
            using bjobs JSON output when supported and the plain-text
            fallback otherwise."""
            with self.runningJobsLock:
                currentjobs = {str(self.batchJobIDs[job][0]): job
                               for job in self.runningJobs}

            if not check_lsf_json_output_supported:
                return self.fallbackRunningJobIDs(currentjobs)

            times = {}
            stdout = call_command(["bjobs", "-json", "-o", "jobid stat start_time"])
            records = self.parseBjobs(stdout)
            if records:
                for record in records:
                    if record['STAT'] == 'RUN' and record['JOBID'] in currentjobs:
                        # An unparseable start time defaults to "now".
                        started = parse(record['START_TIME'],
                                        default=datetime.now(tzlocal()))
                        times[currentjobs[record['JOBID']]] = \
                            datetime.now(tzlocal()) - started
            return times
コード例 #18
0
        def getJobExitCodeBACCT(self, job):
            """Fallback exit-code lookup via ``bacct -l`` (slower than bjobs).

            Returns 0 for a completed job, 1 for a failed job, and None when
            the outcome cannot be determined (e.g. still running).
            """
            # if not found in bjobs, then try bacct (slower than bjobs)
            logger.debug("bjobs failed to detect job - trying bacct: "
                         "{}".format(job))

            stdout = call_command(["bacct", "-l", str(job)])
            for line in stdout.split('\n'):
                if "Completed <done>" in line or "<DONE>" in line:
                    logger.debug("Detected job completed for job: "
                                 "{}".format(job))
                    return 0
                if "Completed <exit>" in line or "<EXIT>" in line:
                    logger.error("Detected job failed for job: "
                                 "{}".format(job))
                    return 1
            logger.debug("Can't determine exit code for job or job still "
                         "running: {}".format(job))
            return None
コード例 #19
0
 def obtainSystemConstants(cls):
     """Return (max CPU count, max memory) across cluster nodes as reported
     by sinfo; raises RuntimeError if nothing usable was returned."""
     # sinfo -Ne --format '%m,%c'
     # sinfo arguments:
     # -N for node-oriented
     # -h for no header
     # -e for exact values (e.g. don't return 32+)
     # --format to get memory, cpu
     max_cpu = 0
     max_mem = MemoryString('0')
     for line in call_command(['sinfo', '-Nhe', '--format', '%m %c']).split('\n'):
         logger.debug("sinfo output %s", line)
         fields = line.split()
         if len(fields) < 2:
             continue
         mem, cpu = fields
         if int(cpu) > max_cpu:
             max_cpu = int(cpu)
         # sinfo reports memory in megabytes.
         candidate = MemoryString(mem + 'M')
         if candidate > max_mem:
             max_mem = candidate
     if max_cpu == 0 or max_mem.byteVal() == 0:
         raise RuntimeError('sinfo did not return memory or cpu info')
     return max_cpu, max_mem
コード例 #20
0
        def getRunningJobIDs(self):
            """Return a dict of our job IDs to seconds of runtime, for jobs
            squeue reports in state 'R'."""
            with self.runningJobsLock:
                # Map the slurm job id (string) to our own internal job id.
                currentjobs = {str(self.batchJobIDs[job][0]): job
                               for job in self.runningJobs}
            # squeue arguments:
            # -h for no header
            # --format to get jobid i, state %t and time days-hours:minutes:seconds
            times = {}
            for line in call_command(['squeue', '-h', '--format', '%i %t %M']).split('\n'):
                fields = line.split()
                if len(fields) < 3:
                    continue
                slurm_jobid, state, elapsed = fields
                if state == 'R' and slurm_jobid in currentjobs:
                    times[currentjobs[slurm_jobid]] = self.parse_elapsed(elapsed)
            return times
コード例 #21
0
ファイル: lsf.py プロジェクト: lorde-collab/toil
        def fallbackGetJobExitCode(self, job):
            """Determine an LSF job's exit code by parsing ``bjobs -l`` text.

            Returns 0 on success, an integer exit code on failure, None when
            the job is pending or still running, and defers to
            getJobExitCodeBACCT when bjobs has no record of the job.
            """
            args = ["bjobs", "-l", str(job)]
            logger.debug(
                "Checking job exit code for job via bjobs (fallback): "
                "{}".format(job))
            stdout = call_command(args)
            # bjobs wraps long lines with a 21-space continuation indent;
            # undo that so each record fits on one line.
            output = stdout.replace("\n                     ", "")
            process_output = output.split('\n')
            started = 0
            for line in process_output:
                if "Done successfully" in line or "Status <DONE>" in line:
                    logger.debug("bjobs detected job completed for job: "
                                 "{}".format(job))
                    return 0
                elif "New job is waiting for scheduling" in line:
                    logger.debug("bjobs detected job pending scheduling for "
                                 "job: {}".format(job))
                    return None
                elif "PENDING REASONS" in line or "Status <PEND>" in line:
                    logger.debug("bjobs detected job pending for job: "
                                 "{}".format(job))
                    return None
                elif "Exited with exit code" in line:
                    # IDIOM FIX: renamed from `exit`, which shadowed the builtin.
                    exit_code = int(line[line.find("Exited with exit code ") +
                                         22:].split('.')[0])
                    logger.error("bjobs detected job exit code "
                                 "{} for job {}".format(exit_code, job))
                    return exit_code
                elif "Completed <exit>" in line:
                    logger.error("bjobs detected job failed for job: "
                                 "{}".format(job))
                    return 1
                elif line.find("Started on ") > -1 or "Status <RUN>" in line:
                    started = 1
            if started == 1:
                logger.debug("bjobs detected job started but not completed: "
                             "{}".format(job))
                return None

            return self.getJobExitCodeBACCT(job)
コード例 #22
0
    def obtainSystemConstants(cls):
        """Return (maxCPU, maxMEM): the largest ncpus and maxmem values that
        ``lshosts`` reports across all hosts.

        Raises RuntimeError for missing columns, inconsistent rows, or when
        no host reported usable values.
        """
        stdout = call_command(["lshosts"])
        rows = stdout.split('\n')
        header = rows[0].strip().split()
        num_columns = len(header)
        cpu_index = None
        mem_index = None
        for index, column in enumerate(header):
            if column == 'ncpus':
                cpu_index = index
            elif column == 'maxmem':
                mem_index = index

        if cpu_index is None or mem_index is None:
            raise RuntimeError(
                "lshosts command does not return ncpus or maxmem columns")

        maxCPU = 0
        maxMEM = MemoryString("0")
        for row in rows[1:]:
            fields = row.strip().split()
            if not fields:
                continue
            if len(fields) < num_columns:
                raise RuntimeError(
                    "lshosts output has a varying number of columns")
            # '-' marks hosts for which lshosts has no data.
            if fields[cpu_index] != '-' and int(fields[cpu_index]) > int(maxCPU):
                maxCPU = int(fields[cpu_index])
            if fields[mem_index] != '-' and MemoryString(fields[mem_index]) > maxMEM:
                maxMEM = MemoryString(fields[mem_index])

        if maxCPU == 0 or maxMEM == MemoryString("0"):
            raise RuntimeError("lshosts returns null ncpus or maxmem info")

        logger.debug("Got the maxMEM: {}".format(maxMEM))
        logger.debug("Got the maxCPU: {}".format(maxCPU))

        return maxCPU, maxMEM
コード例 #23
0
 def submitJob(self, subLine):
     """Run *subLine* and return the job id printed on the first line of
     its output."""
     first_line = call_command(subLine).split('\n')[0]
     return int(first_line.strip())
コード例 #24
0
 def killJob(self, jobID):
     """Cancel the batch job corresponding to *jobID* via ``qdel``."""
     batch_id = self.getBatchSystemID(jobID)
     call_command(['qdel', batch_id])
コード例 #25
0
ファイル: lsf.py プロジェクト: rahul-yadav-supra/toil
        def _getJobExitCodeFromBjobs(self, job):
            """First pass: parse ``bjobs -l``. Returns (resolved, code) where
            resolved is False when bjobs had no record of the job at all."""
            args = ["bjobs", "-l", str(job)]
            logger.debug("Checking job exit code for job via bjobs: "
                         "{}".format(job))
            stdout = call_command(args)
            # bjobs wraps long lines with a 21-space continuation indent;
            # undo that so each record fits on one line.
            output = stdout.replace("\n                     ", "")
            started = 0
            for line in output.split('\n'):
                if "Done successfully" in line or "Status <DONE>" in line:
                    logger.debug("bjobs detected job completed for job: "
                                 "{}".format(job))
                    return True, 0
                elif "New job is waiting for scheduling" in line:
                    logger.debug("bjobs detected job pending scheduling for "
                                 "job: {}".format(job))
                    return True, None
                elif "PENDING REASONS" in line or "Status <PEND>" in line:
                    logger.debug("bjobs detected job pending for job: "
                                 "{}".format(job))
                    return True, None
                elif "Exited with exit code" in line:
                    # IDIOM FIX: renamed from `exit`, which shadowed the builtin.
                    exit_code = int(line[line.find("Exited with exit code ") +
                                         22:].split('.')[0])
                    logger.error("bjobs detected job exit code "
                                 "{} for job {}".format(exit_code, job))
                    return True, exit_code
                elif "Completed <exit>" in line:
                    logger.error("bjobs detected job failed for job: "
                                 "{}".format(job))
                    return True, 1
                elif line.find("Started on ") > -1 or "Status <RUN>" in line:
                    started = 1
            if started == 1:
                logger.debug("bjobs detected job started but not completed: "
                             "{}".format(job))
                return True, None
            return False, None

        def _getJobExitCodeFromBacct(self, job):
            """Fallback: parse ``bacct -l`` (slower than bjobs). Returns 0 for
            success, 1 for failure, None when undetermined."""
            # if not found in bjobs, then try bacct (slower than bjobs)
            logger.debug("bjobs failed to detect job - trying bacct: "
                         "{}".format(job))
            stdout = call_command(["bacct", "-l", str(job)])
            for line in stdout.split('\n'):
                if line.find("Completed <done>") > -1 or line.find(
                        "<DONE>") > -1:
                    logger.debug("Detected job completed for job: "
                                 "{}".format(job))
                    return 0
                elif line.find("Completed <exit>") > -1 or line.find(
                        "<EXIT>") > -1:
                    logger.error("Detected job failed for job: "
                                 "{}".format(job))
                    return 1
            logger.debug("Can't determine exit code for job or job still "
                         "running: {}".format(job))
            return None

        def getJobExitCode(self, lsfJobID):
            """Determine an LSF job's exit code, first via bjobs and then via
            the slower bacct when bjobs has forgotten the job.

            Returns 0 for success, an integer exit code for failure, and
            None when the job is pending/running or undetermined.
            """
            # the task is set as part of the job ID if using getBatchSystemID()
            job, task = (lsfJobID, None)
            if '.' in lsfJobID:
                job, task = lsfJobID.split('.', 1)

            # DECOMPOSITION: the original was one 60-line method doing two
            # distinct lookups; split into the bjobs pass and bacct fallback.
            resolved, code = self._getJobExitCodeFromBjobs(job)
            if resolved:
                return code
            return self._getJobExitCodeFromBacct(job)
コード例 #26
0
ファイル: slurm.py プロジェクト: tmooney/toil
        def _getJobDetailsFromScontrol(self, job_id_list: list) -> dict:
            """
            Get SLURM job states and exit codes for the jobs in `job_id_list`
            by running `scontrol show job`.

            :param job_id_list: list of integer batch job IDs.
            :return: dict mapping each job-id in `job_id_list` to a tuple
                ``(state, return_code)``; both elements are ``None`` when
                `scontrol` reported nothing usable for that job.
            """
            args = ['scontrol', 'show', 'job']
            # `scontrol` can only return information about a single job,
            # or all the jobs it knows about.
            if len(job_id_list) == 1:
                args.append(str(job_id_list[0]))

            stdout = call_command(args)

            # Normalize to text first: the original str/bytes branches left
            # `job_records` unbound (NameError) for any other return type;
            # decoding unconditionally keeps one code path.
            if isinstance(stdout, bytes):
                stdout = stdout.decode('utf-8')
            # Job records are separated by a blank line.
            job_records = stdout.strip().split('\n\n')

            # Collect the job statuses in a dict; key is the job-id, value is a tuple containing
            # job state and exit status. Initialize dict before processing output of `scontrol`,
            # so jobs that scontrol no longer knows about still get a (None, None) entry.
            job_statuses = {}
            for job_id in job_id_list:
                job_statuses[job_id] = (None, None)

            # `scontrol` will report "No jobs in the system", if there are no jobs in the system,
            # and if no job-id was passed as argument to `scontrol`.
            if len(job_records) > 0 and job_records[0] == "No jobs in the system":
                return job_statuses

            for record in job_records:
                job = {}
                for line in record.splitlines():
                    for item in line.split():
                        # Output is in the form of many key=value pairs, multiple pairs on each line
                        # and multiple lines in the output. Each pair is pulled out of each line and
                        # added to a dictionary.
                        # Note: In some cases, the value itself may contain white-space. So, if we find
                        # a key without a value, we consider that key part of the previous value.
                        bits = item.split('=', 1)
                        if len(bits) == 1:
                            # Continuation of the previous value; relies on `key`
                            # having been set by a prior key=value token. The
                            # first token of a record is always "JobId=...", so
                            # `key` is bound before this branch can run.
                            job[key] += ' ' + bits[0]
                        else:
                            key = bits[0]
                            job[key] = bits[1]
                    # The first line of the record contains the JobId. Stop processing the remainder
                    # of this record, if we're not interested in this job.
                    job_id = int(job['JobId'])
                    if job_id not in job_id_list:
                        logger.debug("%s job %d is not in the list", args[0],
                                     job_id)
                        break
                if job_id not in job_id_list:
                    continue
                state = job['JobState']
                logger.debug("%s state of job %s is %s", args[0], job_id,
                             state)
                # EAFP try/except KeyError replaced by dict.get; the previous
                # `exitcode is not None` guard was redundant for a present key
                # (scontrol values are strings) but is exactly what .get gives us.
                exitcode = job.get('ExitCode')
                if exitcode is not None:
                    # ExitCode is "<status>:<signal>".
                    status, signal = [int(n) for n in exitcode.split(':')]
                    if signal > 0:
                        # A non-zero signal may indicate e.g. an out-of-memory killed job
                        status = 128 + signal
                    logger.debug(
                        "%s exit code of job %d is %s, return status %d",
                        args[0], job_id, exitcode, status)
                    rc = status
                else:
                    rc = None
                job_statuses[job_id] = (state, rc)
            logger.debug("%s returning job statuses: %s", args[0],
                         job_statuses)
            return job_statuses
コード例 #27
0
ファイル: torque.py プロジェクト: carissafletcher/toil
 def submitJob(self, subLine):
     # Run the already-assembled submission command line and hand back
     # whatever the scheduler prints (typically the new job's identifier).
     output = call_command(subLine)
     return output
コード例 #28
0
        def getJobExitCode(self, lsfJobID):
            """
            Determine the exit status of an LSF job via ``bjobs``.

            :param lsfJobID: batch-system job ID string; may contain a
                ".task" suffix and/or the "NOT_SUBMITTED" marker.
            :return: 0 on success, a non-zero int (or BatchJobExitReason)
                on failure, or None if the job is still pending/running.
            """
            # the task is set as part of the job ID if using getBatchSystemID()
            if "NOT_SUBMITTED" in lsfJobID:
                logger.error("bjobs detected job failed to submit")
                return 1

            job, task = (lsfJobID, None)
            if '.' in lsfJobID:
                job, task = lsfJobID.split('.', 1)

            # Record peak memory usage for the job as a side effect.
            self.parseMaxMem(job)
            # first try bjobs to find out job state
            # NOTE(review): `check_lsf_json_output_supported` is referenced,
            # not called — confirm it is a module-level flag rather than a
            # function, otherwise this branch is always taken.
            if check_lsf_json_output_supported:
                args = [
                    "bjobs", "-json", "-o",
                    "user exit_code stat exit_reason pend_reason",
                    str(job)
                ]
                logger.debug("Checking job exit code for job via bjobs: "
                             "{}".format(job))
                stdout = call_command(args)
                bjobs_records = self.parseBjobs(stdout)
                # NOTE(review): if parseBjobs returns nothing, or the record
                # lacks 'STAT', this method falls through and implicitly
                # returns None ("still running") — confirm that is intended
                # rather than falling back to bacct.
                if bjobs_records:
                    process_output = bjobs_records[0]
                    if 'STAT' in process_output:
                        process_status = process_output['STAT']
                        if process_status == 'DONE':
                            logger.debug(
                                "bjobs detected job completed for job: {}".
                                format(job))
                            return 0
                        if process_status == 'PEND':
                            pending_info = ""
                            if 'PEND_REASON' in process_output:
                                if process_output['PEND_REASON']:
                                    pending_info = "\n" + \
                                        process_output['PEND_REASON']
                            logger.debug(
                                "bjobs detected job pending with: {}\nfor job: {}"
                                .format(pending_info, job))
                            return None
                        if process_status == 'EXIT':
                            # Job failed; gather the exit code and reason for logging.
                            exit_code = 1
                            exit_reason = ""
                            if 'EXIT_CODE' in process_output:
                                exit_code_str = process_output['EXIT_CODE']
                                if exit_code_str:
                                    exit_code = int(exit_code_str)
                            if 'EXIT_REASON' in process_output:
                                exit_reason = process_output['EXIT_REASON']
                            exit_info = ""
                            if exit_code:
                                exit_info = "\nexit code: {}".format(exit_code)
                            if exit_reason:
                                exit_info += "\nexit reason: {}".format(
                                    exit_reason)
                            logger.error(
                                "bjobs detected job failed with: {}\nfor job: {}"
                                .format(exit_info, job))
                            # Memory-limit kills are reported distinctly so the
                            # caller can react (e.g. retry with more memory).
                            if "TERM_MEMLIMIT" in exit_reason:
                                return BatchJobExitReason.MEMLIMIT
                            return exit_code
                        if process_status == 'RUN':
                            logger.debug(
                                "bjobs detected job started but not completed for job: {}"
                                .format(job))
                            return None
                        if process_status in {'PSUSP', 'USUSP', 'SSUSP'}:
                            # Suspended states are treated as "not finished yet".
                            logger.debug(
                                "bjobs detected job suspended for job: {}".
                                format(job))
                            return None

                        # Unrecognized state: fall back to bacct accounting data.
                        return self.getJobExitCodeBACCT(job)
            else:
                return self.fallbackGetJobExitCode(job)
コード例 #29
0
ファイル: miscTests.py プロジェクト: thiagogenez/toil
 def test_call_command_err(self):
     # A failing command must raise CalledProcessErrorStderr whose message
     # embeds the command, its exit status, and the captured stderr text.
     expected = ("^Command '\\['cat', '/dev/Frankenheimer']' exit status 1: "
                 "cat: /dev/Frankenheimer: No such file or directory\n$")
     with self.assertRaisesRegex(CalledProcessErrorStderr, expected):
         call_command(["cat", "/dev/Frankenheimer"])
コード例 #30
0
ファイル: miscTests.py プロジェクト: thiagogenez/toil
 def test_call_command_ok(self):
     # A successful command returns its captured stdout, decoded to str.
     output = call_command(["echo", "Fred"])
     self.assertTrue(isinstance(output, str), str(type(output)))
     self.assertEqual("Fred\n", output)