def getListOfJobStates(jobIds, username):
    """Return a JobStatus for each entry of the module-level ``jobs`` list.

    Args:
        jobIds: collection of job ids to report. A falsy value (``None`` or
            an empty collection) selects every job.
        username: not used by this implementation.

    Returns:
        A list of ``JobStatus`` objects, built with ``JobStatus.kRunning``
        for jobs whose ``running`` attribute is truthy and
        ``JobStatus.kWaiting`` otherwise.
    """
    # ``guard`` and ``jobs`` are defined outside this block; presumably
    # ``guard`` is a lock protecting ``jobs`` -- confirm against the module.
    with guard:
        return [
            JobStatus(
                job.jobId,
                JobStatus.kRunning if job.running else JobStatus.kWaiting)
            for job in jobs
            if not jobIds or job.jobId in jobIds
        ]
def getListOfJobStates(jobName, username = None, detailed = True):
    """Query LoadLeveler via ``llq`` and return the status of matching jobs.

    The output is processed line by line. A line starting with
    ``===== Job Step mgmt.`` opens a new job; the following
    ``Job Name``, ``Status``, ``Step Virtual Memory`` and ``Step User Time``
    lines are attached to that job.

    Args:
        jobName: only jobs whose "Job Name" equals this value are returned;
            ``None`` selects every job.
        username: not used; the query always runs for the output of
            ``whoami``.
        detailed: if true, ``-x`` is added to the ``llq`` call.

    Returns:
        A list of ``JobStatus`` objects, in the order the matching
        "Job Name" lines appear in the output.

    Raises:
        batchelor.BatchelorException: if ``llq`` returns a non-zero code or
            its output cannot be parsed.
    """
    if detailed:
        command = "llq -u `whoami` -m -x"
    else:
        command = "llq -u `whoami` -m"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")

    knownFields = ("Job Name", "Step Virtual Memory", "Status", "Step User Time")
    jobStates = []
    currentJobStatus = None
    for rawLine in stdout.split('\n'):
        if rawLine.startswith("===== Job Step mgmt."):
            try:
                # the job id sits between the first and the last dot
                currentJobId = int(rawLine[rawLine.find(".") + 1:rawLine.rfind(".")])
                currentJobStatus = JobStatus(currentJobId)
            except ValueError:
                raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
            continue

        # collapse runs of whitespace so that "Key:   value" becomes "Key: value"
        line = ' '.join(rawLine.split())
        # Split off the field name as a real prefix. str.lstrip(chars) treats
        # its argument as a character set and would also eat the start of the
        # value (e.g. "Submission Error" -> "bmission Error").
        (field, separator, value) = line.partition(": ")
        if not separator or field not in knownFields:
            continue
        if currentJobStatus is None:
            raise batchelor.BatchelorException(
                "parsing of llq output failed, got '{0}' before job id.".format(field))

        if field == "Job Name":
            if jobName is None or value == jobName:
                jobStates.append(currentJobStatus)
        elif field == "Step Virtual Memory":
            try:
                parsed = value.split()
                currentJobStatus.setMemoryUsage(
                    float(parsed[0]) * _kMemoryUnits[parsed[1]], 0)
            except (ValueError, IndexError, KeyError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get memory usage failed.")
        elif field == "Status":
            # keep the raw llq status text, then refine it where it is known
            currentJobStatus.setStatus(JobStatus.kUnknown, name = value)
            if value == 'Running':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif value in ('I', 'Idle', 'Pending'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif value in ('Submission Error', 'Terminated', 'Removed', 'Remove Pending'):
                currentJobStatus.setStatus(JobStatus.kError)
        else:  # "Step User Time", given as hours:minutes:seconds
            try:
                parts = value.split(':')
                hours = float(parts[0])
                minutes = float(parts[1])
                seconds = float(parts[2])
            except (ValueError, IndexError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get user time failed.")
            # the time is handed over in hours
            currentJobStatus.setCpuTime(hours + minutes / 60.0 + seconds / 3600.0, 0)
    return jobStates
def getListOfJobStates(select_jobIDs, username):
    """Query Grid Engine via ``qstat`` and return the status of selected jobs.

    The job table of ``qstat`` supplies job id (first column) and state
    (fifth column). For jobs in state ``r`` or ``hr`` a second call,
    ``qstat -xml -j <id>``, is made and CPU time and virtual memory are
    recorded per task.

    Args:
        select_jobIDs: container of job ids to report; ``None`` selects
            every job found.
        username: user passed to ``qstat -u``; ``None`` runs plain ``qstat``.

    Returns:
        A list of ``JobStatus`` objects. A running job that has vanished by
        the time of the detail query is left out.

    Raises:
        batchelor.BatchelorException: if a ``qstat`` call returns a non-zero
            code or its output cannot be parsed.
    """
    # get list of all jobs
    if username is None:
        command = "qstat"
    else:
        command = "qstat -u {0}".format(username)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []

    # The first two lines are the table header. Blank lines (e.g. after a
    # trailing newline) carry no job and are skipped instead of failing.
    jobs = []
    for jobLine in stdout.split('\n')[2:]:
        columns = jobLine.split()
        if not columns:
            continue
        try:
            jobs.append((int(columns[0]), columns[4]))
        except (ValueError, IndexError):
            raise batchelor.BatchelorException(
                "parsing of qstat output to get job id failed.")

    list_of_states = []
    for (jobID, state) in jobs:
        if select_jobIDs is not None and jobID not in select_jobIDs:
            continue
        job_status = JobStatus(jobID)
        # keep the raw qstat state, then refine it where it is known
        job_status.setStatus(JobStatus.kUnknown, name=state)
        if state in ('qw', 'hqw'):
            job_status.setStatus(JobStatus.kWaiting)
        elif state == 't':
            job_status.setStatus(JobStatus.kTransmitting)
        elif state in ('d', 'dr', 'dt'):
            job_status.setStatus(JobStatus.kDeletion)
        elif state == 'Eq':
            job_status.setStatus(JobStatus.kError)
        elif state in ('r', 'hr'):
            # get detailed job information
            command = "qstat -xml -j {0}".format(jobID)
            (detailCode, detailOut, detailErr) = batchelor.runCommand(command)
            if detailCode != 0:
                raise batchelor.BatchelorException(
                    "qstat failed (stderr: '" + detailErr + "')")
            if 'unknown_jobs' in detailOut:
                continue  # the job has been ended between the qstat command and now
            try:
                root = ElementTree.fromstring(detailOut)
            except ElementTree.ParseError as e:
                # use the same ElementTree name as the call above, so the
                # handler does not depend on a separate ``xml`` binding
                raise batchelor.BatchelorException(
                    "xml-parser could not parse output of qstat -xml -j {0}: {1}"
                    .format(jobID, e))
            for child in root[0]:
                for task in child.findall('JB_ja_tasks'):
                    for sublist in task.findall('ulong_sublist'):
                        task_entry = sublist.find('JAT_task_number')
                        if task_entry is None:
                            # without a task number the usage cannot be
                            # attributed to a task
                            continue
                        task_number = int(task_entry.text)
                        job_status.setStatus(JobStatus.kRunning)
                        for usage_list in sublist.findall('JAT_scaled_usage_list'):
                            for scaled in usage_list.findall('scaled'):
                                name = scaled.findall('UA_name')[0].text
                                value = scaled.findall('UA_value')[0].text
                                if name == 'cpu':
                                    # value is divided by 3600
                                    job_status.setCpuTime(
                                        float(value) / 3600.0, task_number)
                                elif name == 'vmem':
                                    # value is divided by 1024**3
                                    job_status.setMemoryUsage(
                                        float(value) / (1024.0)**3, task_number)
        list_of_states.append(job_status)
    return list_of_states
def getListOfJobStates(select_jobIDs, username):
    """Query Grid Engine via ``qstat`` and return the status of selected jobs.

    The job table of ``qstat`` supplies job id (first column) and state
    (fifth column). For jobs in state ``r`` or ``hr`` a second call,
    ``qstat -xml -j <id>``, is made and CPU time and virtual memory are
    recorded per task.

    Args:
        select_jobIDs: container of job ids to report; ``None`` selects
            every job found.
        username: user passed to ``qstat -u``; ``None`` runs plain ``qstat``.

    Returns:
        A list of ``JobStatus`` objects. A running job that has vanished by
        the time of the detail query is left out.

    Raises:
        batchelor.BatchelorException: if a ``qstat`` call returns a non-zero
            code or its output cannot be parsed.
    """
    # get list of all jobs
    if username is None:
        command = "qstat"
    else:
        command = "qstat -u {0}".format(username)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []

    # The first two lines are the table header. Blank lines (e.g. after a
    # trailing newline) carry no job and are skipped instead of failing.
    parsedJobs = []
    for tableLine in stdout.split('\n')[2:]:
        fields = tableLine.split()
        if not fields:
            continue
        try:
            parsedJobs.append((int(fields[0]), fields[4]))
        except (ValueError, IndexError):
            raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")

    list_of_states = []
    for (jobID, jobState) in parsedJobs:
        if select_jobIDs is not None and jobID not in select_jobIDs:
            continue
        job_status = JobStatus(jobID)
        # keep the raw qstat state, then refine it where it is known
        job_status.setStatus(JobStatus.kUnknown, name = jobState)
        if jobState in ('qw', 'hqw'):
            job_status.setStatus(JobStatus.kWaiting)
        elif jobState == 't':
            job_status.setStatus(JobStatus.kTransmitting)
        elif jobState in ('d', 'dr', 'dt'):
            job_status.setStatus(JobStatus.kDeletion)
        elif jobState == 'Eq':
            job_status.setStatus(JobStatus.kError)
        elif jobState in ('r', 'hr'):
            # get detailed job information
            command = "qstat -xml -j {0}".format(jobID)
            (xmlCode, xmlOut, xmlErr) = batchelor.runCommand(command)
            if xmlCode != 0:
                raise batchelor.BatchelorException("qstat failed (stderr: '" + xmlErr + "')")
            if 'unknown_jobs' in xmlOut:
                continue  # the job has been ended between the qstat command and now
            try:
                root = ElementTree.fromstring(xmlOut)
            except ElementTree.ParseError as e:
                # use the same ElementTree name as the call above, so the
                # handler does not depend on a separate ``xml`` binding
                raise batchelor.BatchelorException("xml-parser could not parse output of qstat -xml -j {0}: {1}".format(jobID, e))
            for child in root[0]:
                for task in child.findall('JB_ja_tasks'):
                    for sublist in task.findall('ulong_sublist'):
                        numberNode = sublist.find('JAT_task_number')
                        if numberNode is None:
                            # without a task number the usage cannot be
                            # attributed to a task
                            continue
                        task_number = int(numberNode.text)
                        job_status.setStatus(JobStatus.kRunning)
                        for usage_list in sublist.findall('JAT_scaled_usage_list'):
                            for scaled in usage_list.findall('scaled'):
                                name = scaled.findall('UA_name')[0].text
                                value = scaled.findall('UA_value')[0].text
                                if name == 'cpu':
                                    # value is divided by 3600
                                    job_status.setCpuTime(float(value) / 3600.0, task_number)
                                elif name == 'vmem':
                                    # value is divided by 1024**3
                                    job_status.setMemoryUsage(float(value) / (1024.0)**3, task_number)
        list_of_states.append(job_status)
    return list_of_states
def getListOfJobStates(jobName, username=None, detailed=True):
    """Query Slurm via ``squeue`` and return the status of matching jobs.

    Every non-empty output line is split on whitespace; column 0 is read as
    the job id, column 2 as the job name, column 4 as the state and column 5
    as the time, which is handed to ``setCpuTime`` in hours.

    Args:
        jobName: only jobs whose name column equals this value are
            returned; ``None`` selects every job.
        username: not used; the query always runs for ``$(whoami)``.
        detailed: not used by this implementation.

    Returns:
        A list of ``JobStatus`` objects, one per matching output line.

    Raises:
        batchelor.BatchelorException: if ``squeue`` returns a non-zero code
            or a line cannot be parsed.
    """
    command = "squeue --clusters=serial -u $(whoami) -l -h"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("squeue failed (stderr: '" + stderr + "')")

    jobStates = []
    for line in stdout.split('\n'):
        if line.startswith("CLUSTER: serial"):
            continue
        lineSplit = line.split()
        if not lineSplit:
            # empty output or a trailing newline: nothing to parse
            continue
        if len(lineSplit) < 6:
            raise batchelor.BatchelorException(
                "parsing of squeue output failed, too few columns in line '{0}'."
                .format(line))
        try:
            currentJobId = int(lineSplit[0])
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of squeue output to get job id failed.")
        currentJobStatus = JobStatus(currentJobId)

        # name
        name = lineSplit[2]
        if jobName is None or name == jobName:
            jobStates.append(currentJobStatus)

        # status: keep the raw squeue text, then refine it where it is known
        status = lineSplit[4]
        currentJobStatus.setStatus(JobStatus.kUnknown, name=status)
        if status == 'RUNNING':
            currentJobStatus.setStatus(JobStatus.kRunning)
        elif status in ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED', 'COMPLETI'):
            currentJobStatus.setStatus(JobStatus.kWaiting)
        elif status in ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL'):
            currentJobStatus.setStatus(JobStatus.kError)
        else:
            # single-string form prints the same text on Python 2 and 3
            print("Unknown job status " + status)

        # time: [days-][hours:]minutes:seconds
        time_str = lineSplit[5]
        try:
            hours = 0.0
            clock = time_str
            if '-' in clock:
                (days, clock) = clock.split('-', 1)
                hours += float(days) * 24
            parts = clock.split(':')
            seconds = float(parts[-1])
            minutes = float(parts[-2])
            if len(parts) > 2:
                hours += float(parts[-3])
        except (ValueError, IndexError):
            raise batchelor.BatchelorException(
                "parsing of squeue output to get time information failed. ({0})"
                .format(time_str))
        currentJobStatus.setCpuTime(hours + minutes / 60.0 + seconds / 3600.0, 0)
    return jobStates
def getListOfJobStates(jobName, username=None, detailed=True):
    """Query LoadLeveler via ``llq`` and return the status of matching jobs.

    The output is processed line by line. A line starting with
    ``===== Job Step mgmt.`` opens a new job; the following
    ``Job Name``, ``Status``, ``Step Virtual Memory`` and ``Step User Time``
    lines are attached to that job.

    Args:
        jobName: only jobs whose "Job Name" equals this value are returned;
            ``None`` selects every job.
        username: not used; the query always runs for the output of
            ``whoami``.
        detailed: if true, ``-x`` is added to the ``llq`` call.

    Returns:
        A list of ``JobStatus`` objects, in the order the matching
        "Job Name" lines appear in the output.

    Raises:
        batchelor.BatchelorException: if ``llq`` returns a non-zero code or
            its output cannot be parsed.
    """
    command = "llq -u `whoami` -m -x" if detailed else "llq -u `whoami` -m"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr +
                                           "')")

    handledKeys = ("Job Name", "Step Virtual Memory", "Status",
                   "Step User Time")
    jobStates = []
    currentJobStatus = None
    for outputLine in stdout.split('\n'):
        if outputLine.startswith("===== Job Step mgmt."):
            try:
                # the job id sits between the first and the last dot
                currentJobId = int(
                    outputLine[outputLine.find(".") + 1:outputLine.rfind(".")])
                currentJobStatus = JobStatus(currentJobId)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")
            continue

        # collapse runs of whitespace so that "Key:   value" becomes "Key: value"
        line = ' '.join(outputLine.split())
        # Split off the key as a real prefix. str.lstrip(chars) treats its
        # argument as a character set and would also eat the start of the
        # value (e.g. "Submission Error" -> "bmission Error").
        (key, separator, value) = line.partition(": ")
        if not separator or key not in handledKeys:
            continue
        if currentJobStatus is None:
            raise batchelor.BatchelorException(
                "parsing of llq output failed, got '{0}' before job id."
                .format(key))

        if key == "Job Name":
            if jobName is None or value == jobName:
                jobStates.append(currentJobStatus)
        elif key == "Step Virtual Memory":
            try:
                parsed = value.split()
                currentJobStatus.setMemoryUsage(
                    float(parsed[0]) * _kMemoryUnits[parsed[1]], 0)
            except (ValueError, IndexError, KeyError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get memory usage failed.")
        elif key == "Status":
            # keep the raw llq status text, then refine it where it is known
            currentJobStatus.setStatus(JobStatus.kUnknown, name=value)
            if value == 'Running':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif value in ('I', 'Idle', 'Pending'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif value in ('Submission Error', 'Terminated', 'Removed',
                           'Remove Pending'):
                currentJobStatus.setStatus(JobStatus.kError)
        else:  # "Step User Time", given as hours:minutes:seconds
            try:
                parts = value.split(':')
                hours = float(parts[0])
                minutes = float(parts[1])
                seconds = float(parts[2])
            except (ValueError, IndexError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get user time failed.")
            # the time is handed over in hours
            currentJobStatus.setCpuTime(
                hours + minutes / 60.0 + seconds / 3600.0, 0)
    return jobStates
def getListOfJobStates(jobName, username = None, detailed = True):
    """Query Slurm via ``squeue`` and return the status of matching jobs.

    Every non-empty output line is split on whitespace; column 0 is read as
    the job id, column 2 as the job name, column 4 as the state and column 5
    as the time, which is handed to ``setCpuTime`` in hours.

    Args:
        jobName: only jobs whose name column equals this value are
            returned; ``None`` selects every job.
        username: not used; the query always runs for ``$(whoami)``.
        detailed: not used by this implementation.

    Returns:
        A list of ``JobStatus`` objects, one per matching output line.

    Raises:
        batchelor.BatchelorException: if ``squeue`` returns a non-zero code
            or a line cannot be parsed.
    """
    command = "squeue --clusters=serial -u $(whoami) -l -h"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("squeue failed (stderr: '" + stderr + "')")

    jobStates = []
    for line in stdout.split('\n'):
        if line.startswith("CLUSTER: serial"):
            continue
        columns = line.split()
        if not columns:
            # empty output or a trailing newline: nothing to parse
            continue
        if len(columns) < 6:
            raise batchelor.BatchelorException("parsing of squeue output failed, too few columns in line '{0}'.".format(line))
        try:
            currentJobId = int(columns[0])
        except ValueError:
            raise batchelor.BatchelorException("parsing of squeue output to get job id failed.")
        currentJobStatus = JobStatus(currentJobId)

        # name
        if jobName is None or columns[2] == jobName:
            jobStates.append(currentJobStatus)

        # status: keep the raw squeue text, then refine it where it is known
        status = columns[4]
        currentJobStatus.setStatus(JobStatus.kUnknown, name = status)
        if status == 'RUNNING':
            currentJobStatus.setStatus(JobStatus.kRunning)
        elif status in ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED', 'COMPLETI'):
            currentJobStatus.setStatus(JobStatus.kWaiting)
        elif status in ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL'):
            currentJobStatus.setStatus(JobStatus.kError)
        else:
            # single-string form prints the same text on Python 2 and 3
            print("Unknown job status " + status)

        # time: [days-][hours:]minutes:seconds
        time_str = columns[5]
        try:
            hours = 0.0
            clock = time_str
            if '-' in clock:
                (days, clock) = clock.split('-', 1)
                hours += float(days) * 24
            pieces = clock.split(':')
            seconds = float(pieces[-1])
            minutes = float(pieces[-2])
            if len(pieces) > 2:
                hours += float(pieces[-3])
        except (ValueError, IndexError):
            raise batchelor.BatchelorException("parsing of squeue output to get time information failed. ({0})".format(time_str))
        currentJobStatus.setCpuTime(hours + minutes / 60.0 + seconds / 3600.0, 0)
    return jobStates