def submitJob(config, command, outputFile, jobName, arrayStart = None, arrayEnd = None, arrayStep = None):
    """Submit ``command`` as a batch job through ``qsub`` and return its job id.

    The header file named in the configuration is copied to a temporary
    script, ``command`` is appended to it and the script is piped into
    ``qsub`` on stdin.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "project", "queue" and "memory"
    command    -- text appended to the copied header file
    outputFile -- value of the ``-o`` option
    jobName    -- value of the ``-N`` option; the option is omitted when None
    arrayStart, arrayEnd, arrayStep -- when arrayStart is not None the job is
                  submitted with ``-t arrayStart-arrayEnd:arrayStep``

    Returns the job id as an int.

    Raises batchelor.BatchelorException if qsub returns a non-zero code or
    the job id cannot be parsed from its output.
    """
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-P " + config.get(submoduleIdentifier(), "project") + " "
        cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
        cmnd += "-l h_vmem=" + config.get(submoduleIdentifier(), "memory") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # The script is only needed while qsub reads it; remove it on every
        # path so that failed submissions do not leave temp files behind.
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # NOTE: lstrip() removes a *set of characters*, not a prefix.  This works
    # here only because the job id starts with a digit, which is not part of
    # either character set.
    if arrayStart is not None:
        jobId = stdout.lstrip("Your job-array ")
        jobId = jobId[:jobId.find('.')]
    else:
        jobId = stdout.lstrip("Your job ")
        jobId = jobId[:jobId.find(' ')]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
    return jobId
def submitJob(config, command, outputFile, jobName, wd=None):
    """Submit ``command`` as a SLURM job through ``sbatch`` and return its job id.

    A temporary batch script is written that consists of ``#SBATCH``
    directives, the configured header file (without its ``#!`` lines) and
    ``command``.  If the configuration has a "max_active_jobs" option, the
    call blocks, polling every 90 seconds, until fewer jobs than that are
    reported by getListOfActiveJobs(None).

    Arguments:
    config     -- configuration object; ``has_option``/``get`` are used to read
                  "max_active_jobs" (optional), "header_file",
                  "wall_clock_limit" and "memory"
    command    -- text appended to the end of the batch script
    outputFile -- value of the ``#SBATCH -o`` directive
    jobName    -- value of the ``#SBATCH -J`` directive; omitted when None
    wd         -- value of the ``#SBATCH -D`` directive; defaults to the
                  current working directory

    Returns the job id as an int.

    Raises batchelor.BatchelorException if sbatch returns a non-zero code or
    the job id cannot be parsed from its output.
    """
    # check if only a certain amount of active jobs is allowed
    if config.has_option(submoduleIdentifier(), "max_active_jobs"):
        max_active_jobs = int(config.get(submoduleIdentifier(), "max_active_jobs"))
        i = 0
        waitTime = 90
        while len(getListOfActiveJobs(None)) >= max_active_jobs:
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)  # wait 1.5 min
            i += 1
        if i > 0:
            sys.stdout.write("\r")
    if wd is None:
        wd = os.getcwd()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
            tempFile.write("#SBATCH --mem=" + config.get(submoduleIdentifier(), "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            tempFile.write("#SBATCH --clusters=serial \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own interpreter line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        cmnd = "sbatch " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the script even if writing it or calling sbatch raised
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
    try:
        # the job id is taken from the fourth whitespace-separated token
        jobId = int(stdout.split()[3])
    except (IndexError, ValueError):
        # IndexError: output shorter than expected; ValueError: token is not a number
        raise batchelor.BatchelorException('parsing output of sbatch to get job id failed.')
    return jobId
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None, priority=None, ompNumThreads=None):
    """Submit ``command`` as a batch job through ``qsub`` and return its job id.

    The configured header file is copied to a temporary script, an optional
    ``export OMP_NUM_THREADS=...`` line and ``command`` are appended, and the
    script is piped into ``qsub`` on stdin.

    Arguments:
    config     -- configuration object; ``has_option``/``get`` are used to read
                  "header_file", "memory", "arch" and the optional
                  "shortqueue"/"longqueue" switches
    command    -- text appended to the script
    outputFile -- value of the ``-o`` option; must not be below the home folder
    jobName    -- value of the ``-N`` option; omitted when None
    wd         -- value of the ``-wd`` option ("/tmp/" when not given); must
                  not be below the home folder
    arrayStart, arrayEnd, arrayStep -- when arrayStart is not None the job is
                  submitted with ``-t arrayStart-arrayEnd:arrayStep``
    priority   -- optional number that is mapped linearly onto the ``-p``
                  value (-1 -> -1023 after clamping, +1 -> 1024)
    ompNumThreads -- when not None, exported as OMP_NUM_THREADS and passed
                  as ``-pe mt <n>``

    Returns the job id as an int.

    Raises batchelor.BatchelorException if a path lies in the home folder,
    qsub returns a non-zero code or the job id cannot be parsed.
    """
    homeFolder = os.path.realpath(os.path.expanduser('~'))

    def isInHomeFolder(path):
        # Compare whole path components: a plain substring test would also
        # reject e.g. "/home/user2/x" when the home folder is "/home/user".
        realPath = os.path.realpath(path)
        return realPath == homeFolder or realPath.startswith(homeFolder.rstrip(os.sep) + os.sep)

    # some checks of the job-settings
    if wd and isInHomeFolder(wd):
        raise batchelor.BatchelorException("The given working-directory is in your home-folder which is no allowed at E18: '{0}'".format(wd))
    if isInHomeFolder(outputFile):
        raise batchelor.BatchelorException("The given output-file is in your home-folder which is no allowed at E18: '{0}'".format(outputFile))
    if priority:
        priority = max(int(-1024 + 2048 * (priority+1.0)/2.0), -1023)
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            if ompNumThreads is not None:
                scriptFile.write("export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "-b no "
        cmnd += "-m n "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
        cmnd += "-o '" + outputFile + "' "
        cmnd += "-wd '" + ("/tmp/" if not wd else wd) + "' "
        # exactly one queue resource is requested; "shortqueue" wins over "longqueue"
        if config.has_option(submoduleIdentifier(), "shortqueue") and config.get(submoduleIdentifier(), "shortqueue") in [1, "1", "TRUE", "true", "True"]:
            cmnd += "-l short=1 "
        elif config.has_option(submoduleIdentifier(), "longqueue") and config.get(submoduleIdentifier(), "longqueue") in [1, "1", "TRUE", "true", "True"]:
            cmnd += "-l long=1 "
        else:
            cmnd += "-l medium=1 "
        cmnd += "-l h_pmem=" + config.get(submoduleIdentifier(), "memory") + " "
        cmnd += "-l arch=" + config.get(submoduleIdentifier(), "arch") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "-p {0} ".format(priority) if priority else ""
        cmnd += "-pe mt {0} ".format(ompNumThreads) if ompNumThreads is not None else ""
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the script on every path, including failed submissions
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # NOTE: lstrip() removes a set of characters, not a prefix; this relies on
    # the job id starting with a digit.
    if arrayStart is not None:
        jobId = stdout.lstrip("Your job-array ")
        jobId = jobId[:jobId.find('.')]
    else:
        jobId = stdout.lstrip("Your job ")
        jobId = jobId[:jobId.find(' ')]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
    return jobId
def submitJob(config, command, outputFile, jobName, wd=None, arrayStart=None, arrayEnd=None, arrayStep=None):
    """Submit ``command`` as an LSF job through ``bsub`` and return its job id.

    The configured header file is copied to a temporary script, ``command``
    is appended and the script is piped into ``bsub`` on stdin.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "queue", "type", "pool" and the optional
                  "memory"
    command    -- text appended to the script
    outputFile -- value of the ``-o`` option
    jobName    -- value of the ``-J`` option; omitted when None.  For array
                  jobs a random 7-letter name is generated if none is given
    wd         -- value of the ``-cwd`` option; omitted when not given
    arrayStart, arrayEnd, arrayStep -- when arrayStart is not None,
                  "[arrayStart-arrayEnd:arrayStep]" is appended to the job name

    Returns the job id as an int.

    Raises batchelor.BatchelorException if bsub returns a non-zero code or
    the job id cannot be parsed from its output.
    """
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        if arrayStart is not None:
            if (jobName is None) or (len(jobName) == 0):
                # ascii_lowercase exists in Python 2 and 3, string.lowercase only in 2
                jobName = ''.join(random.sample(string.ascii_lowercase, 7))
            jobName = jobName + "[" + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + "]"
        cmnd = "bsub "
        cmnd += "" if jobName is None else ("-J " + jobName + " ")
        cmnd += "-o " + outputFile + " "
        cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
        # -cwd is an option of its own; it must not end up inside the quoted
        # resource requirement string that follows -R.
        cmnd += "-cwd '{0}' ".format(wd) if wd else ""
        cmnd += "-R '"
        cmnd += " select[type=" + config.get(submoduleIdentifier(), "type") + "]"
        cmnd += " rusage[pool=" + config.get(submoduleIdentifier(), "pool") + "]"
        try:
            cmnd += " rusage[mem=" + config.get(submoduleIdentifier(), "memory") + "]"
            cmnd += " select[maxmem>" + config.get(submoduleIdentifier(), "memory") + "]"
        except ConfigParser.NoOptionError:
            # "memory" is optional; without it no memory requirement is added
            pass
        cmnd += _getExcludedHostsString(config)
        cmnd += "' "
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the script on every path, including failed submissions
        batchelor.runCommand('rm -f ' + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr + "')")
    # example output: Job <533476534> is submitted to queue <1nd>.
    # NOTE: lstrip() removes a set of characters, not a prefix; this relies on
    # the job id starting with a digit.
    jobId = stdout.lstrip("Job <")
    jobId = jobId[:jobId.find(">")]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of bsub output to get job id failed.')
    return jobId
def getListOfActiveJobs(jobName):
    """Return the ids of the caller's jobs that ``llq`` currently lists.

    With ``jobName`` set to None every job id found in the plain ``llq``
    listing is returned once.  Otherwise the long listing (``llq -m``) is
    written to a temporary file and scanned; only jobs whose "Job Name"
    entry equals ``jobName`` are returned.

    Raises batchelor.BatchelorException if llq returns a non-zero code or
    its output cannot be parsed.
    """
    if jobName is not None:
        (scratchHandle, scratchPath) = tempfile.mkstemp()
        os.close(scratchHandle)
        (returncode, stdout, stderr) = batchelor.runCommand("llq -u `whoami` -m &> " + scratchPath)
        if returncode != 0:
            batchelor.runCommand("rm -f " + scratchPath)
            raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
        matchingIds = []
        lastSeenId = -1
        with open(scratchPath, 'r') as longListing:
            for rawLine in longListing:
                text = rawLine[:-1]  # drop the last character of the line
                if text.startswith("===== Job Step mgmt."):
                    # the id sits between the first and the last dot
                    try:
                        lastSeenId = int(text[text.find(".") + 1:text.rfind(".")])
                    except ValueError:
                        batchelor.runCommand("rm -f " + scratchPath)
                        raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
                # collapse all whitespace runs so the prefix test below is stable
                text = ' '.join(text.split())
                if not text.startswith("Job Name: "):
                    continue
                if lastSeenId < 0:
                    batchelor.runCommand("rm -f " + scratchPath)
                    raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
                if text[10:] == jobName:
                    matchingIds.append(lastSeenId)
        batchelor.runCommand("rm -f " + scratchPath)
        return matchingIds

    (returncode, stdout, stderr) = batchelor.runCommand("llq -u `whoami`")
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
    if stdout == "llq: There is currently no job status to report.":
        return []
    # the first two and the last two lines of the listing are not job rows
    firstColumns = []
    for row in stdout.split('\n')[2:-2]:
        firstColumns.append(row.split()[0])
    uniqueIds = []
    try:
        for token in firstColumns:
            number = int(token[token.find(".") + 1:token.rfind(".")])
            if number not in uniqueIds:
                uniqueIds.append(number)
    except ValueError:
        raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
    return uniqueIds
def submitJob(config, command, outputFile, jobName, wd = None):
    """Submit ``command`` as a SLURM job through ``sbatch`` and return its job id.

    A temporary batch script is written that consists of ``#SBATCH``
    directives, the configured header file (without its ``#!`` lines) and
    ``command``.  If the configuration has a "max_active_jobs" option, the
    call blocks, polling every 90 seconds, until fewer jobs than that are
    reported by getListOfActiveJobs(None).

    Arguments:
    config     -- configuration object; ``has_option``/``get`` are used to read
                  "max_active_jobs" (optional), "header_file",
                  "wall_clock_limit" and "memory"
    command    -- text appended to the end of the batch script
    outputFile -- value of the ``#SBATCH -o`` directive
    jobName    -- value of the ``#SBATCH -J`` directive; omitted when None
    wd         -- value of the ``#SBATCH -D`` directive; defaults to the
                  current working directory

    Returns the job id as an int.

    Raises batchelor.BatchelorException if sbatch returns a non-zero code or
    the job id cannot be parsed from its output.
    """
    # check if only a certain amount of active jobs is allowed
    if config.has_option(submoduleIdentifier(), "max_active_jobs"):
        max_active_jobs = int(config.get(submoduleIdentifier(), "max_active_jobs"))
        i = 0
        waitTime = 90
        while len(getListOfActiveJobs(None)) >= max_active_jobs:
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)  # wait 1.5 min
            i += 1
        if i > 0:
            sys.stdout.write("\r")
    if wd is None:
        wd = os.getcwd()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
            tempFile.write("#SBATCH --mem=" + config.get(submoduleIdentifier(), "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            tempFile.write("#SBATCH --clusters=serial \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own interpreter line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        cmnd = "sbatch " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the script even if writing it or calling sbatch raised
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
    try:
        # the job id is taken from the fourth whitespace-separated token
        jobId = int(stdout.split()[3])
    except (IndexError, ValueError):
        # IndexError: output shorter than expected; ValueError: token is not a number
        raise batchelor.BatchelorException('parsing output of sbatch to get job id failed.')
    return jobId
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None):
    """Submit ``command`` as an HTCondor job via ``condor_submit``; return its id.

    Two files are created in ``<current working directory>/.log``: an
    executable script (configured header file plus ``command``) and a submit
    description that references it.  Both are registered for removal at
    interpreter exit.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "memory", "disk" and "flavour"
    command    -- text appended to the script
    outputFile -- base path for the job's output; also used for
                  ``<outputFile>.condor`` (log) and ``<outputFile>.err``.
                  When empty/None no output, log or error file is requested
    jobName    -- passed as ``-batch-name``; omitted when empty/None
    wd         -- forwarded to batchelor.runCommand as ``wd`` when given
    arrayStart, arrayEnd, arrayStep -- must all be None; array jobs are not
                  supported by this backend

    Returns the job id as an int.

    Raises batchelor.BatchelorException for array requests, a submit
    directory containing a space, a failing condor_submit or unparsable
    output.
    """
    if arrayStart is not None or arrayEnd is not None or arrayStep is not None:
        raise batchelor.BatchelorException("Array jobs are not (yet) implementet for CERNs HTCondor system")
    filesDir = os.path.join(os.getcwd(), '.log')
    if " " in filesDir:
        raise batchelor.BatchelorException("Cannot handle submit directories with whitespaces")
    if not os.path.exists(filesDir):
        os.makedirs(filesDir)
    (fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir, prefix='submitFiles_', suffix='.submit')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove( submitFileName ))
    (fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir, prefix='scriptFiles_', suffix='.sh')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove( scriptFileName ))
    # 0o755 is the octal spelling accepted by both Python 2.6+ and Python 3
    os.chmod(scriptFileName, 0o755)
    batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + scriptFileName)
    with open(scriptFileName, 'a') as scriptFile:
        scriptFile.write(command)
    with open(submitFileName, 'w') as submitFile:
        submitFile.write("executable = {0}\n".format(scriptFileName))
        # Test before abspath(): abspath("") is the current directory, so a
        # check made afterwards could never be false.
        if outputFile:
            outputFile = os.path.abspath(outputFile)
            submitFile.write("output = {0}\n".format(outputFile))
            submitFile.write("log = {0}.condor\n".format(outputFile))
            submitFile.write("error = {0}.err\n".format(outputFile))
        submitFile.write("should_transfer_files = NO\n")  # Disable file transport
        submitFile.write("request_cpus = 1\n")
        submitFile.write("request_memory = {0}\n".format(config.get(submoduleIdentifier(), "memory")))
        submitFile.write("request_disk = {0}\n".format(config.get(submoduleIdentifier(), "disk")))
        submitFile.write("+JobFlavour = \"{0}\"\n".format(config.get(submoduleIdentifier(), "flavour")))
        submitFile.write("queue 1\n")
    cmnd = "condor_submit '{0}'".format(submitFileName)
    if jobName:
        cmnd += " -batch-name {0} ".format(jobName)
    kwargs = {}
    if wd:
        kwargs['wd'] = wd
    (returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_submit failed (stderr: '" + stderr + "')")
    try:
        # the id is the sixth token of the second output line, followed by a "."
        jobId = int(stdout.split('\n')[1].split()[5].rstrip("."))
    except (IndexError, ValueError):
        # IndexError: fewer lines/tokens than expected; ValueError: not a number
        raise batchelor.BatchelorException('parsing of condor_submit output to get job id failed.')
    return jobId
def submitJob(config, command, outputFile, jobName, wd=None, arrayStart=None, arrayEnd=None, arrayStep=None):
    """Submit ``command`` as a batch job through ``qsub`` and return its job id.

    The header file named in the configuration is copied to a temporary
    script, ``command`` is appended to it and the script is piped into
    ``qsub`` on stdin.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "project", "queue" and "memory"
    command    -- text appended to the copied header file
    outputFile -- value of the ``-o`` option
    jobName    -- value of the ``-N`` option; the option is omitted when None
    wd         -- not supported by this backend; any true value raises
    arrayStart, arrayEnd, arrayStep -- when arrayStart is not None the job is
                  submitted with ``-t arrayStart-arrayEnd:arrayStep``

    Returns the job id as an int.

    Raises batchelor.BatchelorException if ``wd`` is given, qsub returns a
    non-zero code or the job id cannot be parsed from its output.
    """
    if wd:
        raise batchelor.BatchelorException("Choosing the working directory is not jet implemented for {0}".format(submoduleIdentifier()))
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "" if jobName is None else ("-N " + jobName + " ")
        if arrayStart is not None:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-P " + config.get(submoduleIdentifier(), "project") + " "
        cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
        cmnd += "-l h_vmem=" + config.get(submoduleIdentifier(), "memory") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # The script is only needed while qsub reads it; remove it on every
        # path so that failed submissions do not leave temp files behind.
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # NOTE: lstrip() removes a set of characters, not a prefix; this relies on
    # the job id starting with a digit.
    if arrayStart is not None:
        jobId = stdout.lstrip("Your job-array ")
        jobId = jobId[:jobId.find('.')]
    else:
        jobId = stdout.lstrip("Your job ")
        jobId = jobId[:jobId.find(' ')]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
    return jobId
def getListOfActiveJobs(jobName):
    """Return the ids of the jobs that ``qstat`` currently reports.

    With ``jobName`` set to None the first column of every row of the plain
    ``qstat`` table is returned.  Otherwise ``qstat -xml -j <jobName>`` is
    evaluated: its output is split with awk into one file per XML document
    and the ``JB_job_number`` of every entry is collected.

    Returns a list of ints (empty if qstat reports no such job).

    Raises batchelor.BatchelorException if qstat fails or its output cannot
    be parsed.
    """
    if jobName is None:
        command = "qstat"
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
        if stdout == "":
            return []
        # the first two lines are the table header; blank rows carry no job
        jobList = [job for job in stdout.split('\n')[2:] if job.strip()]
        try:
            return [int(job.split()[0]) for job in jobList]
        except ValueError:
            raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")
    command = "qstat -j " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        # an unknown job name is reported as an error; treat it as "no jobs"
        if stderr.split('\n')[0][:-1] == "Following jobs do not exist or permissions are not sufficient:":
            return []
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        command = "qstat -xml -j " + jobName + " > " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
        # Start a new "<fileName><n>.awkOut" file at every XML declaration so
        # that each file can be parsed as one document.
        batchelor.runCommand("awk '/<\\?xml version='\"'\"'1.0'\"'\"'\\?>/{n++}{print >\"" + fileName + "\" n \".awkOut\" }' " + fileName)
        jobIds = []
        for xmlFile in glob.glob(fileName + "*.awkOut"):
            root = ElementTree.parse(xmlFile).getroot()
            for child in root[0]:
                jobIdList = child.findall("JB_job_number")
                if len(jobIdList) != 1:
                    raise batchelor.BatchelorException("parsing xml from qstat failed")
                try:
                    jobId = int(jobIdList[0].text)
                except (TypeError, ValueError):
                    # TypeError: element without text; ValueError: not a number
                    raise batchelor.BatchelorException("parsing int from xml from qstat failed")
                jobIds.append(jobId)
        return jobIds
    finally:
        # remove the raw output and all split files, also when parsing failed
        batchelor.runCommand("rm -f " + fileName + " " + fileName + "*.awkOut")
def getListOfJobStates(jobName, username=None, detailed=True):
    """Return JobStatus objects for the caller's jobs listed by ``squeue``.

    Every non-empty row of ``squeue --clusters=serial -u $(whoami) -l -h`` is
    turned into a JobStatus carrying the SLURM state (column 5) and the
    elapsed time in hours (column 6, "[days-][hours:]minutes:seconds").
    Only jobs whose name (column 3) equals ``jobName`` are returned; with
    ``jobName`` set to None all jobs are returned.

    ``username`` and ``detailed`` are accepted but not used by this
    implementation.

    Raises batchelor.BatchelorException if squeue fails or a row cannot be
    parsed.
    """
    command = "squeue --clusters=serial -u $(whoami) -l -h"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("squeue failed (stderr: '" + stderr + "')")
    jobStates = []
    for line in stdout.split('\n'):
        if line.startswith("CLUSTER: serial"):
            continue
        lineSplit = line.split()
        if not lineSplit:
            # empty output or blank row: nothing to parse
            continue
        if len(lineSplit) < 6:
            raise batchelor.BatchelorException("parsing of squeue output failed, unexpected line: '" + line + "'")
        try:
            currentJobId = int(lineSplit[0])
            currentJobStatus = JobStatus(currentJobId)
            # name
            name = lineSplit[2]
            if name == jobName or jobName is None:
                jobStates.append(currentJobStatus)
            # status
            status = lineSplit[4]
            currentJobStatus.setStatus(JobStatus.kUnknown, name=status)
            if status == 'RUNNING':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif status in ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED', 'COMPLETI'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif status in ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL'):
                currentJobStatus.setStatus(JobStatus.kError)
            else:
                # single parenthesised string: same output under Python 2 and 3
                print("Unknown job status " + status)
            # time
            time_str = lineSplit[5]
            try:
                hours = 0.0
                if '-' in time_str:
                    # "<days>-<rest>"
                    time_str = time_str.split('-')
                    hours += float(time_str[0]) * 24
                    time_str = time_str[1].split(':')
                else:
                    time_str = time_str.split(':')
                seconds = float(time_str[-1])
                minutes = float(time_str[-2])
                if len(time_str) > 2:
                    hours += float(time_str[-3])
                total_time = hours + minutes / 60.0 + seconds / 3600.0
                currentJobStatus.setCpuTime(total_time, 0)
            except (IndexError, ValueError):
                # IndexError: fewer than two ":"-separated fields
                raise batchelor.BatchelorException("parsing of squeue output to get time information failed. ({0})".format(lineSplit[5]))
        except ValueError:
            raise batchelor.BatchelorException("parsing of squeue output to get job id failed.")
    return jobStates
def resetErrorJobs(jobName):
    """Clear the error state of every job returned by getListOfErrorJobs(jobName).

    ``qmod -cj <id>`` is run for each job.  Success is judged by the text
    'cleared error state of job' appearing in qmod's stdout; the return code
    is not inspected.

    Returns True when all jobs were reset (also when there were none).

    Raises batchelor.BatchelorException as soon as one qmod call does not
    report success.
    """
    for jobId in getListOfErrorJobs(jobName):
        command = "qmod -cj " + str(jobId)
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        # Substring test instead of "find(...) is -1": identity comparison
        # with an int only works by accident of small-int caching.
        if 'cleared error state of job' not in stdout:
            raise batchelor.BatchelorException("qmod failed (stderr: '" + stderr + "')")
    return True
def getListOfActiveJobs(jobName):
    """List the ids of the caller's jobs known to ``llq``.

    jobName None: ids are taken from the first column of the plain ``llq``
    table, each id reported once.
    jobName given: ``llq -m`` is redirected into a temporary file; the id of
    each "===== Job Step mgmt." section is remembered and returned when the
    section's "Job Name" equals ``jobName``.

    Raises batchelor.BatchelorException if llq returns a non-zero code or
    its output cannot be parsed.
    """
    if jobName is None:
        (returncode, stdout, stderr) = batchelor.runCommand("llq -u `whoami`")
        if returncode != 0:
            raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
        if stdout == "llq: There is currently no job status to report.":
            return []
        # the first two and the last two lines of the table are skipped
        stepNames = []
        for tableRow in stdout.split('\n')[2:-2]:
            stepNames.append(tableRow.split()[0])
        seenIds = []
        try:
            for stepName in stepNames:
                begin = stepName.find(".") + 1
                end = stepName.rfind(".")
                parsedId = int(stepName[begin:end])
                if parsedId in seenIds:
                    continue
                seenIds.append(parsedId)
        except ValueError:
            raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
        return seenIds

    (dumpDescriptor, dumpName) = tempfile.mkstemp()
    os.close(dumpDescriptor)
    removeDump = "rm -f " + dumpName
    (returncode, stdout, stderr) = batchelor.runCommand("llq -u `whoami` -m &> " + dumpName)
    if returncode != 0:
        batchelor.runCommand(removeDump)
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")
    foundIds = []
    sectionId = -1
    with open(dumpName, 'r') as dump:
        for entry in dump:
            entry = entry[:-1]  # drop the last character of the line
            if entry.startswith("===== Job Step mgmt."):
                try:
                    sectionId = int(entry[entry.find(".") + 1:entry.rfind(".")])
                except ValueError:
                    batchelor.runCommand(removeDump)
                    raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
            # normalise whitespace before looking for the name entry
            entry = ' '.join(entry.split())
            if entry.startswith("Job Name: "):
                if sectionId < 0:
                    batchelor.runCommand(removeDump)
                    raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")
                if entry[10:] == jobName:
                    foundIds.append(sectionId)
    batchelor.runCommand(removeDump)
    return foundIds
def getListOfJobStates(jobName, username = None, detailed = True):
    """Return JobStatus objects for the caller's jobs listed by ``squeue``.

    Every non-empty row of ``squeue --clusters=serial -u $(whoami) -l -h`` is
    turned into a JobStatus carrying the SLURM state (column 5) and the
    elapsed time in hours (column 6, "[days-][hours:]minutes:seconds").
    Only jobs whose name (column 3) equals ``jobName`` are returned; with
    ``jobName`` set to None all jobs are returned.

    ``username`` and ``detailed`` are accepted but not used by this
    implementation.

    Raises batchelor.BatchelorException if squeue fails or a row cannot be
    parsed.
    """
    command = "squeue --clusters=serial -u $(whoami) -l -h"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("squeue failed (stderr: '" + stderr + "')")
    jobStates = []
    for line in stdout.split('\n'):
        if line.startswith("CLUSTER: serial"):
            continue
        lineSplit = line.split()
        if not lineSplit:
            # empty output or blank row: nothing to parse
            continue
        if len(lineSplit) < 6:
            raise batchelor.BatchelorException("parsing of squeue output failed, unexpected line: '" + line + "'")
        try:
            currentJobId = int(lineSplit[0])
            currentJobStatus = JobStatus(currentJobId)
            # name
            name = lineSplit[2]
            if name == jobName or jobName is None:
                jobStates.append(currentJobStatus)
            # status
            status = lineSplit[4]
            currentJobStatus.setStatus(JobStatus.kUnknown, name = status)
            if status == 'RUNNING':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif status in ('PENDING', 'SUSPENDED', 'COMPLETING', 'COMPLETED', 'COMPLETI'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif status in ('CANCELLED', 'FAILED', 'TIMEOUT', 'NODE_FAIL'):
                currentJobStatus.setStatus(JobStatus.kError)
            else:
                # single parenthesised string: same output under Python 2 and 3
                print("Unknown job status " + status)
            # time
            time_str = lineSplit[5]
            try:
                hours = 0.0
                if '-' in time_str:
                    # "<days>-<rest>"
                    time_str = time_str.split('-')
                    hours += float(time_str[0])*24
                    time_str = time_str[1].split(':')
                else:
                    time_str = time_str.split(':')
                seconds = float(time_str[-1])
                minutes = float(time_str[-2])
                if len(time_str) > 2:
                    hours += float(time_str[-3])
                total_time = hours + minutes / 60.0 + seconds / 3600.0
                currentJobStatus.setCpuTime(total_time, 0)
            except (IndexError, ValueError):
                # IndexError: fewer than two ":"-separated fields
                raise batchelor.BatchelorException("parsing of squeue output to get time information failed. ({0})".format(lineSplit[5]))
        except ValueError:
            raise batchelor.BatchelorException("parsing of squeue output to get job id failed.")
    return jobStates
def deleteJobs(jobIds):
    """Cancel the given jobs with a single ``llcancel`` call.

    Each id is passed as ``mgmt.<id>``.  Nothing is executed for an empty
    (or None) ``jobIds``.

    Returns True.

    Raises batchelor.BatchelorException if llcancel returns a non-zero code.
    """
    if not jobIds:
        return True
    stepArguments = "".join(" mgmt." + str(jobId) for jobId in jobIds)
    (returncode, stdout, stderr) = batchelor.runCommand("llcancel" + stepArguments)
    if returncode == 0:
        return True
    raise batchelor.BatchelorException("llcancel failed (stderr: '" + stderr + "')")
def submitJob(config, command, outputFile, jobName, wd = None):
    """Submit ``command`` as a LoadLeveler job via ``llsubmit``; return its id.

    A temporary job command file is written that consists of ``#@``
    keywords, the configured header file (without its ``#!`` lines), an
    ``exec 2>&1`` line and ``command``.  It is piped into ``llsubmit -``.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "group", "notification", "notify_user",
                  "node_usage", "wall_clock_limit", "resources" and "job_type"
    command    -- text appended to the end of the job command file
    outputFile -- value of both the ``output`` and the ``error`` keyword
    jobName    -- value of the ``job_name`` keyword; omitted when None
    wd         -- not supported by this backend; any true value raises

    Returns the job id as an int.

    Raises batchelor.BatchelorException if ``wd`` is given, llsubmit returns
    a non-zero code or the job id cannot be parsed from its output.
    """
    if wd:
        raise batchelor.BatchelorException("Choosing the working directory is not jet implemented for {0}".format(submoduleIdentifier()))
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#@ group = " + config.get(submoduleIdentifier(), "group") + "\n")
            tempFile.write("#@ output = " + outputFile + "\n")
            tempFile.write("#@ error = " + outputFile + "\n")
            tempFile.write("#@ notification = " + config.get(submoduleIdentifier(), "notification") + "\n")
            tempFile.write("#@ notify_user = " + config.get(submoduleIdentifier(), "notify_user") + "\n")
            tempFile.write("#@ node_usage = " + config.get(submoduleIdentifier(), "node_usage") + "\n")
            tempFile.write("#@ wall_clock_limit = " + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
            tempFile.write("#@ resources = " + config.get(submoduleIdentifier(), "resources") + "\n")
            tempFile.write("#@ job_type = " + config.get(submoduleIdentifier(), "job_type") + "\n")
            # NOTE(review): "class" is filled from the "job_type" option as
            # well -- confirm that this is intended.
            tempFile.write("#@ class = " + config.get(submoduleIdentifier(), "job_type") + "\n")
            if jobName is not None:
                tempFile.write("#@ job_name = " + jobName + "\n")
            tempFile.write("#@ queue\n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the job command file already has its own interpreter line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write("exec 2>&1\n")
            tempFile.write("\n")
            tempFile.write(command)
        cmnd = "llsubmit - < " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # one cleanup for all exit paths instead of one "rm" per branch
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
    # example output stdout:
    # llsubmit: The job "mgmt.12309" has been submitted.
    #
    # example output stderr:
    #
    # llsubmit: Stdin job command file written to "/tmp/loadlx_stdin.27558.CdoVxX".
    #
    # INFO: Project: pr83mo
    # INFO: Project's Expiration Date: 2015-01-31
    # INFO: Budget: Total [cpuh] Used [cpuh] Credit [cpuh]
    # INFO: 1350000 1011028 (75%) 338972 (25%)
    #
    # llsubmit: Processed command file through Submit Filter: "/lrz/loadl/filter/submit_filter_c2pap.pl".
    jobId = stdout.split("\n")[0]
    # the id is the text between '"mgmt.' and the last double quote
    jobId = jobId[jobId.find('"mgmt.')+6:jobId.rfind('"')]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
    return jobId
def deleteJobs(jobIds):
    """Kill the given jobs with a single ``bkill`` call.

    Nothing is executed for an empty (or None) ``jobIds``.  A non-zero
    return code is tolerated when stderr contains 'Job has already finished'.

    Returns True.

    Raises batchelor.BatchelorException for any other bkill failure.
    """
    if not jobIds:
        return True
    killCall = " ".join(["bkill"] + [str(jobId) for jobId in jobIds])
    (returncode, stdout, stderr) = batchelor.runCommand(killCall)
    # stderr is only inspected after a failure
    if returncode != 0 and 'Job has already finished' not in stderr:
        raise batchelor.BatchelorException("bkill failed (stderr: '" + stderr + "')")
    return True
def deleteJobs(jobIds):
    """Remove the given jobs with a single ``condor_rm`` call.

    Nothing is executed for an empty (or None) ``jobIds``.  A non-zero
    return code is tolerated when stderr contains
    "Couldn't find/remove all jobs matching constraint".

    Returns True.

    Raises batchelor.BatchelorException for any other condor_rm failure.
    """
    if not jobIds:
        return True
    removeCall = " ".join(["condor_rm"] + [str(jobId) for jobId in jobIds])
    (returncode, stdout, stderr) = batchelor.runCommand(removeCall)
    # stderr is only inspected after a failure
    if returncode != 0 and "Couldn't find/remove all jobs matching constraint" not in stderr:
        raise batchelor.BatchelorException("condor_rm failed (stderr: '" + stderr + "')")
    return True
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None):
    """Submit ``command`` as an LSF job through ``bsub`` and return its job id.

    The configured header file is copied to a temporary script, ``command``
    is appended and the script is piped into ``bsub`` on stdin.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "queue", "type", "pool" and the optional
                  "memory"
    command    -- text appended to the script
    outputFile -- value of the ``-o`` option
    jobName    -- value of the ``-J`` option; omitted when None.  For array
                  jobs a random 7-letter name is generated if none is given
    wd         -- value of the ``-cwd`` option; omitted when not given
    arrayStart, arrayEnd, arrayStep -- when arrayStart is not None,
                  "[arrayStart-arrayEnd:arrayStep]" is appended to the job name

    Returns the job id as an int.

    Raises batchelor.BatchelorException if bsub returns a non-zero code or
    the job id cannot be parsed from its output.
    """
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        if arrayStart is not None:
            if (jobName is None) or (len(jobName) == 0):
                # ascii_lowercase exists in Python 2 and 3, string.lowercase only in 2
                jobName = ''.join(random.sample(string.ascii_lowercase, 7))
            jobName = jobName + "[" + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + "]"
        cmnd = "bsub "
        cmnd += "" if jobName is None else ("-J " + jobName + " ")
        cmnd += "-o " + outputFile + " "
        cmnd += "-q " + config.get(submoduleIdentifier(), "queue") + " "
        # -cwd is an option of its own; it must not end up inside the quoted
        # resource requirement string that follows -R.
        cmnd += "-cwd '{0}' ".format(wd) if wd else ""
        cmnd += "-R '"
        cmnd += " select[type=" + config.get(submoduleIdentifier(), "type") + "]"
        cmnd += " rusage[pool=" + config.get(submoduleIdentifier(), "pool") + "]"
        try:
            cmnd += " rusage[mem=" + config.get(submoduleIdentifier(), "memory") + "]"
            cmnd += " select[maxmem>" + config.get(submoduleIdentifier(), "memory") + "]"
        except ConfigParser.NoOptionError:
            # "memory" is optional; without it no memory requirement is added
            pass
        cmnd += _getExcludedHostsString(config)
        cmnd += "' "
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the script on every path, including failed submissions
        batchelor.runCommand('rm -f ' + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr + "')")
    # example output: Job <533476534> is submitted to queue <1nd>.
    # NOTE: lstrip() removes a set of characters, not a prefix; this relies on
    # the job id starting with a digit.
    jobId = stdout.lstrip("Job <")
    jobId = jobId[:jobId.find(">")]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of bsub output to get job id failed.')
    return jobId
def getListOfActiveJobs(jobName):
    """Return the ids of the jobs that ``qstat`` currently reports.

    With ``jobName`` set to None the first column of every row of the plain
    ``qstat`` table is returned.  Otherwise ``qstat -xml -j <jobName>`` is
    evaluated: its output is split with awk into one file per XML document
    and the ``JB_job_number`` of every entry is collected.

    Returns a list of ints (empty if qstat reports no such job).

    Raises batchelor.BatchelorException if qstat fails or its output cannot
    be parsed.
    """
    if jobName is None:
        command = "qstat"
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
        if stdout == "":
            return []
        # the first two lines are the table header; blank rows carry no job
        jobList = [job for job in stdout.split('\n')[2:] if job.strip()]
        try:
            return [ int(job.split()[0]) for job in jobList ]
        except ValueError:
            raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")
    command = "qstat -j " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        # an unknown job name is reported as an error; treat it as "no jobs"
        if stderr.split('\n')[0][:-1] == "Following jobs do not exist:":
            return []
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        command = "qstat -xml -j " + jobName + " > " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(command)
        if returncode != 0:
            raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
        # Start a new "<fileName><n>.awkOut" file at every XML declaration so
        # that each file can be parsed as one document.
        batchelor.runCommand("awk '/<\\?xml version='\"'\"'1.0'\"'\"'\\?>/{n++}{print >\"" + fileName + "\" n \".awkOut\" }' " + fileName)
        jobIds = []
        for xmlFile in glob.glob(fileName + "*.awkOut"):
            root = ElementTree.parse(xmlFile).getroot()
            for child in root[0]:
                jobIdList = child.findall("JB_job_number")
                if len(jobIdList) != 1:
                    raise batchelor.BatchelorException("parsing xml from qstat failed")
                try:
                    jobId = int(jobIdList[0].text)
                except (TypeError, ValueError):
                    # TypeError: element without text; ValueError: not a number
                    raise batchelor.BatchelorException("parsing int from xml from qstat failed")
                jobIds.append(jobId)
        return jobIds
    finally:
        # remove the raw output and all split files, also when parsing failed
        batchelor.runCommand("rm -f " + fileName + " " + fileName + "*.awkOut")
def getListOfActiveJobs(jobName):
    """Return the ids of the jobs listed by ``bjobs``.

    When ``jobName`` is not None the listing is restricted with
    ``-J <jobName>``.  The first output line is skipped and the first column
    of every following line is converted to an int.

    Returns a list of ints; empty when bjobs prints nothing on stdout.

    Raises batchelor.BatchelorException if bjobs returns a non-zero code or
    a job id is not a number.
    """
    if jobName is None:
        bjobsCall = "bjobs"
    else:
        bjobsCall = "bjobs" + " -J " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(bjobsCall)
    if returncode != 0:
        raise batchelor.BatchelorException("bjobs failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []
    activeIds = []
    for row in stdout.split('\n')[1:]:
        firstColumn = row.split()[0]
        try:
            activeIds.append(int(firstColumn))
        except ValueError:
            raise batchelor.BatchelorException("parsing of bjobs output to get job id failed.")
    return activeIds
def submitJob(config, command, outputFile, jobName):
    """Submit ``command`` as a LoadLeveler job via ``llsubmit``; return its id.

    A temporary job command file is written that consists of ``#@``
    keywords, the configured header file (without its ``#!`` lines) and
    ``command``.  It is piped into ``llsubmit -``.

    Arguments:
    config     -- configuration object; ``get(section, option)`` is used to
                  read "header_file", "group", "notification", "notify_user",
                  "node_usage", "wall_clock_limit", "resources" and "job_type"
    command    -- text appended to the end of the job command file
    outputFile -- value of the ``output`` keyword
    jobName    -- value of the ``job_name`` keyword; omitted when None

    Returns the job id as an int.

    Raises batchelor.BatchelorException if llsubmit returns a non-zero code
    or the job id cannot be parsed from its output.
    """
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(config.get(submoduleIdentifier(), "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#@ group = " + config.get(submoduleIdentifier(), "group") + "\n")
            tempFile.write("#@ output = " + outputFile + "\n")
            tempFile.write("#@ notification = " + config.get(submoduleIdentifier(), "notification") + "\n")
            tempFile.write("#@ notify_user = " + config.get(submoduleIdentifier(), "notify_user") + "\n")
            tempFile.write("#@ node_usage = " + config.get(submoduleIdentifier(), "node_usage") + "\n")
            tempFile.write("#@ wall_clock_limit = " + config.get(submoduleIdentifier(), "wall_clock_limit") + "\n")
            tempFile.write("#@ resources = " + config.get(submoduleIdentifier(), "resources") + "\n")
            tempFile.write("#@ job_type = " + config.get(submoduleIdentifier(), "job_type") + "\n")
            # NOTE(review): "class" is filled from the "job_type" option as
            # well -- confirm that this is intended.
            tempFile.write("#@ class = " + config.get(submoduleIdentifier(), "job_type") + "\n")
            if jobName is not None:
                tempFile.write("#@ job_name = " + jobName + "\n")
            tempFile.write("#@ queue\n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the job command file already has its own interpreter line
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        cmnd = "llsubmit - < " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # one cleanup for all exit paths instead of one "rm" per branch
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
    # example output stdout:
    # llsubmit: The job "mgmt.12309" has been submitted.
    #
    # example output stderr:
    #
    # llsubmit: Stdin job command file written to "/tmp/loadlx_stdin.27558.CdoVxX".
    #
    # INFO: Project: pr83mo
    # INFO: Project's Expiration Date: 2015-01-31
    # INFO: Budget: Total [cpuh] Used [cpuh] Credit [cpuh]
    # INFO: 1350000 1011028 (75%) 338972 (25%)
    #
    # llsubmit: Processed command file through Submit Filter: "/lrz/loadl/filter/submit_filter_c2pap.pl".
    jobId = stdout.split("\n")[0]
    # the id is the text between '"mgmt.' and the last double quote
    jobId = jobId[jobId.find('"mgmt.')+6:jobId.rfind('"')]
    try:
        jobId = int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
    return jobId
def getListOfActiveJobs(jobName):
    """Return the job ids reported by ``bjobs`` as a list of ints.

    jobName -- if not ``None`` it is passed to ``bjobs -J`` as a filter

    The first output line is treated as the table header and skipped;
    empty output yields an empty list.

    Raises batchelor.BatchelorException if ``bjobs`` returns non-zero or
    if the first column of a job line is not an integer.
    """
    command = "bjobs"
    if jobName is not None:
        command = command + " -J " + jobName
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("bjobs failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []
    # Drop the header line and ignore blank lines, which would otherwise
    # make split()[0] fail with an IndexError.
    jobLines = [line for line in stdout.split('\n')[1:] if line.strip()]
    try:
        return [int(line.split()[0]) for line in jobLines]
    except ValueError:
        raise batchelor.BatchelorException(
            "parsing of bjobs output to get job id failed.")
def getListOfActiveJobs(jobName):
    """Return the cluster ids listed by ``condor_q`` as a list of ints.

    jobName -- if truthy, only jobs whose ``JobBatchName`` equals this
               value are queried

    ``condor_q`` is asked to print one ``<ClusterId>.<ProcId>`` entry per
    line. Empty output yields an empty list.

    Raises batchelor.BatchelorException if ``condor_q`` returns non-zero
    or if an entry cannot be converted to an integer after removing a
    trailing ``.0``.
    """
    command = "condor_q -format \"%d.\" ClusterId -format \"%d\n\" ProcId "
    if jobName:
        command += "-constraint 'JobBatchName == \"{0}\"' ".format(jobName)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_q failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []
    jobs = []
    for line in stdout.split('\n'):
        fields = line.split()
        if not fields:
            continue
        jobToken = fields[0]
        # Remove only the literal ".0" proc suffix. str.rstrip(".0") treats
        # its argument as a set of characters and would also eat trailing
        # zeros of the cluster id itself ("1230.0" -> "123").
        if jobToken.endswith(".0"):
            jobToken = jobToken[:-2]
        try:
            jobs.append(int(jobToken))
        except ValueError:
            raise batchelor.BatchelorException("Cannot parse return of condor_q (stdout: '" + stdout + "')")
    return jobs
def getListOfRunningJobs(jobName):
    """Return the ids of active jobs that ``qstat`` shows in state ``r``.

    jobName -- forwarded to getListOfActiveJobs() to select the jobs

    The first two lines of the ``qstat`` output are skipped as header.
    Only jobs that are also returned by getListOfActiveJobs(jobName) are
    considered.

    Raises batchelor.BatchelorException if ``qstat`` returns non-zero or
    if the first column of a line is not an integer.
    """
    # a set gives constant-time membership tests in the loop below
    activeJobs = set(getListOfActiveJobs(jobName))
    command = "qstat"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    runningJobs = []
    for line in stdout.split('\n')[2:]:
        fields = line.split()
        if not fields:
            # blank line (e.g. trailing newline): nothing to parse
            continue
        try:
            jobId = int(fields[0])
        except ValueError:
            raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")
        if jobId not in activeJobs:
            continue
        # the state is read from the fifth column; shorter lines have none
        if len(fields) > 4 and fields[4] == "r":
            runningJobs.append(jobId)
    return runningJobs
def getListOfErrorJobs(jobName):
    """Return the ids of active jobs that ``qstat`` shows in state ``Eqw``.

    jobName -- forwarded to getListOfActiveJobs() to select the jobs

    The first two lines of the ``qstat`` output are skipped as header.
    Only jobs that are also returned by getListOfActiveJobs(jobName) are
    considered.

    Raises batchelor.BatchelorException if ``qstat`` returns non-zero or
    if the first column of a line is not an integer.
    """
    # a set gives constant-time membership tests in the loop below
    activeJobs = set(getListOfActiveJobs(jobName))
    command = "qstat"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    errorJobs = []
    for line in stdout.split('\n')[2:]:
        fields = line.split()
        if not fields:
            # blank line (e.g. trailing newline): nothing to parse
            continue
        try:
            jobId = int(fields[0])
        except ValueError:
            raise batchelor.BatchelorException(
                "parsing of qstat output to get job id failed.")
        if jobId not in activeJobs:
            continue
        # the state is read from the fifth column; shorter lines have none
        if len(fields) > 4 and fields[4] == "Eqw":
            errorJobs.append(jobId)
    return errorJobs
def getListOfActiveJobs(jobName):
    """Query ``condor_q`` and return the listed cluster ids as ints.

    jobName -- if truthy, restricts the query to jobs whose
               ``JobBatchName`` equals this value

    Every non-empty output line is expected to start with a
    ``<ClusterId>.<ProcId>`` token. Empty output yields an empty list.

    Raises batchelor.BatchelorException if ``condor_q`` returns non-zero
    or if a token is not an integer once a trailing ``.0`` is removed.
    """
    command = "condor_q -format \"%d.\" ClusterId -format \"%d\n\" ProcId "
    if jobName:
        command += "-constraint 'JobBatchName == \"{0}\"' ".format(jobName)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_q failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []
    procSuffix = ".0"
    jobs = []
    for entry in stdout.split('\n'):
        columns = entry.split()
        if len(columns) == 0:
            continue
        token = columns[0]
        # Cut off the exact ".0" suffix instead of using rstrip(".0"):
        # rstrip removes any trailing '.' and '0' characters, which turns
        # e.g. "1230.0" into "123" and reports a wrong job id.
        if token.endswith(procSuffix):
            token = token[:-len(procSuffix)]
        try:
            jobs.append(int(token))
        except ValueError:
            raise batchelor.BatchelorException(
                "Cannot parse return of condor_q (stdout: '" + stdout + "')")
    return jobs
def submitJob(config, command, outputFile, jobName, wd=None, arrayStart=None, arrayEnd=None, arrayStep=None):
    """Submit one job with ``condor_submit`` and return its cluster id.

    A job script (configured header file followed by ``command``) and a
    submit description file are created in ``<cwd>/.log``; both are
    registered for removal at interpreter exit.

    config     -- object offering ``get(section, option)``
    command    -- text appended to the copied header file
    outputFile -- base path for the ``output``, ``log`` (``.condor``) and
                  ``error`` (``.err``) entries; if empty or ``None`` these
                  entries are left out
    jobName    -- if truthy, passed as ``-batch-name``
    wd         -- if truthy, forwarded to batchelor.runCommand as ``wd``
    arrayStart, arrayEnd, arrayStep -- must all be ``None``; array jobs
                  are not supported here

    Raises batchelor.BatchelorException for array-job requests, for a
    submit directory containing a space, when ``condor_submit`` returns
    non-zero, or when its output cannot be parsed.
    """
    if arrayStart is not None or arrayEnd is not None or arrayStep is not None:
        raise batchelor.BatchelorException(
            "Array jobs are not (yet) implementet for CERNs HTCondor system")
    section = submoduleIdentifier()
    filesDir = os.path.join(os.getcwd(), '.log')
    if " " in filesDir:
        raise batchelor.BatchelorException(
            "Cannot handle submit directories with whitespaces")
    if not os.path.exists(filesDir):
        os.makedirs(filesDir)

    (fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir, prefix='submitFiles_', suffix='.submit')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(submitFileName))

    (fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir, prefix='scriptFiles_', suffix='.sh')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(scriptFileName))
    # 0o755 is accepted by Python 2.6+ and Python 3, unlike the old 0755
    os.chmod(scriptFileName, 0o755)

    batchelor.runCommand("cp " + batchelor._getRealPath(config.get(section, "header_file")) + " " + scriptFileName)
    with open(scriptFileName, 'a') as scriptFile:
        scriptFile.write(command)

    with open(submitFileName, 'w') as submitFile:
        submitFile.write("executable = {0}\n".format(scriptFileName))
        # Test before calling abspath(): abspath never returns an empty
        # string, so checking afterwards would always succeed.
        if outputFile:
            outputFile = os.path.abspath(outputFile)
            submitFile.write("output = {0}\n".format(outputFile))
            submitFile.write("log = {0}.condor\n".format(outputFile))
            submitFile.write("error = {0}.err\n".format(outputFile))
        submitFile.write("should_transfer_files = NO\n")  # Disable file transport
        submitFile.write("request_cpus = 1\n")
        submitFile.write("request_memory = {0}\n".format(config.get(section, "memory")))
        submitFile.write("request_disk = {0}\n".format(config.get(section, "disk")))
        submitFile.write("+JobFlavour = \"{0}\"\n".format(config.get(section, "flavour")))
        submitFile.write("queue 1\n")

    cmnd = "condor_submit '{0}'".format(submitFileName)
    if jobName:
        cmnd += " -batch-name {0} ".format(jobName)
    kwargs = {}
    if wd:
        kwargs['wd'] = wd
    (returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_submit failed (stderr: '" + stderr + "')")
    # the id is taken from the sixth word of the second output line,
    # without its trailing "."
    try:
        jobId = int(stdout.split('\n')[1].split()[5].rstrip("."))
    except (ValueError, IndexError):
        raise batchelor.BatchelorException(
            'parsing of condor_submit output to get job id failed.')
    return jobId
def getListOfJobStates(jobName, username=None, detailed=True):
    """Parse ``llq`` output into a list of JobStatus objects.

    jobName  -- only jobs whose "Job Name" equals this value are returned;
                ``None`` selects all jobs
    username -- not used by this implementation (the query always runs
                for the user reported by ``whoami``)
    detailed -- if true, ``-x`` is added to the ``llq`` call

    For every "===== Job Step mgmt.<id>..." block a JobStatus is created;
    the "Step Virtual Memory", "Status" and "Step User Time" lines that
    follow update it.

    Raises batchelor.BatchelorException if ``llq`` returns non-zero or if
    one of these lines cannot be parsed.
    """
    if detailed:
        command = "llq -u `whoami` -m -x"
    else:
        command = "llq -u `whoami` -m"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")

    jobNamePrefix = "Job Name: "
    memoryPrefix = "Step Virtual Memory:"
    statusPrefix = "Status: "
    userTimePrefix = "Step User Time:"

    jobStates = []
    currentJobId = -1
    currentJobStatus = None
    for line in stdout.split('\n'):
        if line.startswith("===== Job Step mgmt."):
            try:
                # the id is the text between the first and the last '.'
                currentJobId = int(line[line.find(".") + 1:line.rfind(".")])
                currentJobStatus = JobStatus(currentJobId)
            except ValueError:
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")
        # collapse all whitespace runs so the prefixes below match exactly
        line = ' '.join(line.split())
        if line.startswith(jobNamePrefix):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id.")
            name = line[len(jobNamePrefix):]
            if name == jobName or jobName is None:
                jobStates.append(currentJobStatus)
        elif line.startswith(memoryPrefix + " "):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id.")
            try:
                # slice the prefix off; lstrip(prefix) would strip a
                # character set instead of the literal text
                parsed = line[len(memoryPrefix):].split()
                currentJobStatus.setMemoryUsage(
                    float(parsed[0]) * _kMemoryUnits[parsed[1]], 0)
            except (ValueError, IndexError, KeyError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")
        elif line.startswith(statusPrefix):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id.")
            # lstrip("Status: ") removed every leading character out of
            # {S,t,a,u,s,:,' '}, so 'Submission Error' became
            # 'bmission Error' and never matched the error states below.
            status = line[len(statusPrefix):]
            currentJobStatus.setStatus(JobStatus.kUnknown, name=status)
            if status == 'Running':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif status in ('I', 'Idle', 'Pending'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif status in ('Submission Error', 'Terminated', 'Removed', 'Remove Pending'):
                currentJobStatus.setStatus(JobStatus.kError)
        elif line.startswith(userTimePrefix + " "):
            if currentJobId < 0:
                raise batchelor.BatchelorException(
                    "parsing of llq output failed, got job name before job id.")
            time_str = line[len(userTimePrefix):].strip().split(':')
            try:
                hours = float(time_str[0])
                minutes = float(time_str[1])
                seconds = float(time_str[2])
                # CPU time is stored in hours
                total_time = hours + minutes / 60.0 + seconds / 3600.0
                currentJobStatus.setCpuTime(total_time, 0)
            except (ValueError, IndexError):
                raise batchelor.BatchelorException(
                    "parsing of llq output to get job id failed.")
    return jobStates
def getListOfJobStates(select_jobIDs, username):
    """Return a JobStatus object for the jobs listed by ``qstat``.

    select_jobIDs -- container of job ids to report; ``None`` reports all
    username      -- if not ``None``, passed to ``qstat -u``

    The first two ``qstat`` lines are skipped as header; the job id is
    read from the first and the state from the fifth column. For jobs in
    state ``r``/``hr`` a second call ``qstat -xml -j <id>`` is made and
    CPU time (divided by 3600) and vmem (divided by 1024**3) are stored
    per task. A job that ``qstat -j`` reports under 'unknown_jobs' is
    left out of the result.

    Raises batchelor.BatchelorException if a ``qstat`` call returns
    non-zero, if the listing cannot be parsed, or if the XML output is
    not well-formed.
    """
    # get list of all jobs
    if username is None:
        command = "qstat"
    else:
        command = "qstat -u {0}".format(username)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []

    # skip the header and ignore blank lines (e.g. a trailing newline)
    jobRows = [row.split() for row in stdout.split('\n')[2:] if row.strip()]
    try:
        jobIDs = [int(row[0]) for row in jobRows]
        jobStates = [row[4] for row in jobRows]
    except (ValueError, IndexError):
        raise batchelor.BatchelorException(
            "parsing of qstat output to get job id failed.")

    list_of_states = []
    for jobID, jobState in zip(jobIDs, jobStates):
        if select_jobIDs is not None and jobID not in select_jobIDs:
            continue
        job_status = JobStatus(jobID)
        job_status.setStatus(JobStatus.kUnknown, name=jobState)
        if jobState in ('qw', 'hqw'):
            job_status.setStatus(JobStatus.kWaiting)
        elif jobState == 't':
            job_status.setStatus(JobStatus.kTransmitting)
        elif jobState in ('d', 'dr', 'dt'):
            job_status.setStatus(JobStatus.kDeletion)
        elif jobState in ('Eq', 'Eqw'):
            # 'Eqw' is the state the qstat-based error-job query of this
            # package looks for; 'Eq' is kept for backward compatibility
            job_status.setStatus(JobStatus.kError)
        elif jobState in ('r', 'hr'):
            # get detailed job information
            command = "qstat -xml -j {0}".format(jobID)
            (returncode, stdout, stderr) = batchelor.runCommand(command)
            if returncode != 0:
                raise batchelor.BatchelorException(
                    "qstat failed (stderr: '" + stderr + "')")
            if 'unknown_jobs' in stdout:
                continue  # the job has been ended between the qstat command and now
            try:
                root = ElementTree.fromstring(stdout)
            except ElementTree.ParseError as e:
                raise batchelor.BatchelorException(
                    "xml-parser could not parse output of qstat -xml -j {0}: {1}"
                    .format(jobID, e))
            for child in root[0]:
                for task in child.findall('JB_ja_tasks'):
                    for sublist in task.findall('ulong_sublist'):
                        task_number = sublist.findall('JAT_task_number')
                        if not task_number:
                            continue
                        task_number = int(task_number[0].text)
                        job_status.setStatus(JobStatus.kRunning)
                        for usage_list in sublist.findall('JAT_scaled_usage_list'):
                            for scaled in usage_list.findall('scaled'):
                                name = scaled.findall('UA_name')[0].text
                                value = scaled.findall('UA_value')[0].text
                                if name == 'cpu':
                                    job_status.setCpuTime(
                                        float(value) / 3600.0, task_number)
                                elif name == 'vmem':
                                    job_status.setMemoryUsage(
                                        float(value) / (1024.0)**3, task_number)
        list_of_states.append(job_status)
    return list_of_states
def getListOfJobStates(select_jobIDs, username):
    """Build the list of JobStatus objects for the jobs shown by ``qstat``.

    select_jobIDs -- container of job ids to include; ``None`` includes
                     every listed job
    username      -- if not ``None``, the listing is done with
                     ``qstat -u <username>``

    Job id and state come from the first and fifth column of every line
    after the two header lines. Jobs in state ``r``/``hr`` are queried
    again with ``qstat -xml -j <id>``; per task the 'cpu' usage (divided
    by 3600) and the 'vmem' usage (divided by 1024**3) are recorded. Jobs
    for which that call reports 'unknown_jobs' are omitted.

    Raises batchelor.BatchelorException if a ``qstat`` call returns
    non-zero, if the listing cannot be parsed, or if the XML output is
    not well-formed.
    """
    # get list of all jobs
    command = "qstat" if username is None else "qstat -u {0}".format(username)
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
    if stdout == "":
        return []

    # Two header lines are skipped. Blank lines are ignored so that they
    # cannot trigger an IndexError on split()[0].
    columnsPerJob = [job.split() for job in stdout.split('\n')[2:] if job.strip()]
    try:
        jobIDs = [int(columns[0]) for columns in columnsPerJob]
        jobStates = [columns[4] for columns in columnsPerJob]
    except (ValueError, IndexError):
        raise batchelor.BatchelorException("parsing of qstat output to get job id failed.")

    waitingStates = ('qw', 'hqw')
    deletionStates = ('d', 'dr', 'dt')
    # 'Eqw' is what the qstat-based error-job query of this package checks
    # for; 'Eq' is kept so that previously matched input still matches.
    errorStates = ('Eq', 'Eqw')
    runningStates = ('r', 'hr')

    list_of_states = []
    for i, jobID in enumerate(jobIDs):
        if select_jobIDs is not None and jobID not in select_jobIDs:
            continue
        state = jobStates[i]
        job_status = JobStatus(jobID)
        job_status.setStatus(JobStatus.kUnknown, name=state)
        if state in waitingStates:
            job_status.setStatus(JobStatus.kWaiting)
        elif state == 't':
            job_status.setStatus(JobStatus.kTransmitting)
        elif state in deletionStates:
            job_status.setStatus(JobStatus.kDeletion)
        elif state in errorStates:
            job_status.setStatus(JobStatus.kError)
        elif state in runningStates:
            # get detailed job information
            command = "qstat -xml -j {0}".format(jobID)
            (returncode, stdout, stderr) = batchelor.runCommand(command)
            if returncode != 0:
                raise batchelor.BatchelorException("qstat failed (stderr: '" + stderr + "')")
            if 'unknown_jobs' in stdout:
                continue  # the job has been ended between the qstat command and now
            try:
                root = ElementTree.fromstring(stdout)
            except ElementTree.ParseError as e:
                # use the same module name as the fromstring() call above
                raise batchelor.BatchelorException("xml-parser could not parse output of qstat -xml -j {0}: {1}".format(jobID, e))
            for child in root[0]:
                for task in child.findall('JB_ja_tasks'):
                    for sublist in task.findall('ulong_sublist'):
                        task_number = sublist.findall('JAT_task_number')
                        if not task_number:
                            continue
                        task_number = int(task_number[0].text)
                        job_status.setStatus(JobStatus.kRunning)
                        for usage_list in sublist.findall('JAT_scaled_usage_list'):
                            for scaled in usage_list.findall('scaled'):
                                name = scaled.findall('UA_name')[0].text
                                value = scaled.findall('UA_value')[0].text
                                if name == 'cpu':
                                    job_status.setCpuTime(float(value) / 3600.0, task_number)
                                elif name == 'vmem':
                                    job_status.setMemoryUsage(float(value) / (1024.0)**3, task_number)
        list_of_states.append(job_status)
    return list_of_states
def submitJob(config, command, outputFile, jobName, wd=None, arrayStart=None, arrayEnd=None, arrayStep=None, priority=None, ompNumThreads=None):
    """Submit a job (or job array) with ``qsub`` and return its job id.

    The configured header file is copied to a temporary script, an
    optional ``OMP_NUM_THREADS`` export and ``command`` are appended, and
    the script is piped to ``qsub``. The temporary script is removed on
    every exit path.

    config        -- object offering ``get`` / ``has_option``
    command       -- text appended to the job script
    outputFile    -- path passed to ``-o``; must not be inside the home
                     directory
    jobName       -- passed to ``-N`` unless ``None``
    wd            -- working directory for ``-wd`` ('/tmp/' if not given);
                     must not be inside the home directory
    arrayStart, arrayEnd, arrayStep -- if arrayStart is not ``None`` they
                     form the ``-t start-end:step`` option
    priority      -- if truthy, mapped by
                     ``max(int(-1024 + 2048 * (priority + 1.0) / 2.0), -1023)``
                     and passed to ``-p``
    ompNumThreads -- if not ``None``, exported as OMP_NUM_THREADS and
                     requested via ``-pe mt``

    Raises batchelor.BatchelorException if wd or outputFile lie in the
    home directory, if ``qsub`` returns non-zero, or if no job id can be
    parsed from its output.
    """
    homeDir = os.path.realpath(os.path.expanduser('~'))

    def isInsideHome(path):
        # Compare whole path components: a plain substring test would also
        # reject e.g. '/data/home/user' or '/home/username2'.
        realPath = os.path.realpath(path)
        return realPath == homeDir or realPath.startswith(homeDir.rstrip(os.sep) + os.sep)

    # some checks of the job-settings
    if wd and isInsideHome(wd):
        raise batchelor.BatchelorException(
            "The given working-directory is in your home-folder which is no allowed at E18: '{0}'"
            .format(wd))
    if isInsideHome(outputFile):
        raise batchelor.BatchelorException(
            "The given output-file is in your home-folder which is no allowed at E18: '{0}'"
            .format(outputFile))
    if priority:
        priority = max(int(-1024 + 2048 * (priority + 1.0) / 2.0), -1023)

    section = submoduleIdentifier()
    enabledValues = [1, "1", "TRUE", "true", "True"]
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            if ompNumThreads is not None:
                scriptFile.write("export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
            scriptFile.write(command)

        parts = ["qsub ", "-j y ", "-b no ", "-m n "]
        if jobName is not None:
            parts.append("-N " + jobName + " ")
        if arrayStart is not None:
            parts.append("-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " ")
        parts.append("-o '" + outputFile + "' ")
        parts.append("-wd '" + (wd if wd else "/tmp/") + "' ")
        # queue selection: short, else long, else medium
        if config.has_option(section, "shortqueue") and config.get(section, "shortqueue") in enabledValues:
            parts.append("-l short=1 ")
        elif config.has_option(section, "longqueue") and config.get(section, "longqueue") in enabledValues:
            parts.append("-l long=1 ")
        else:
            parts.append("-l medium=1 ")
        parts.append("-l h_pmem=" + config.get(section, "memory") + " ")
        parts.append("-l arch=" + config.get(section, "arch") + " ")
        parts.append(_getExcludedHostsString(config))
        if priority:
            parts.append("-p {0} ".format(priority))
        if ompNumThreads is not None:
            parts.append("-pe mt {0} ".format(ompNumThreads))
        parts.append("< " + fileName)
        cmnd = "".join(parts)

        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
        if returncode != 0:
            raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
        # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
        # lstrip() strips a character set, not the literal prefix; that is
        # sufficient here because none of the stripped characters is a digit.
        if arrayStart is not None:
            jobId = stdout.lstrip("Your job-array ")
            jobId = jobId[:jobId.find('.')]
        else:
            jobId = stdout.lstrip("Your job ")
            jobId = jobId[:jobId.find(' ')]
        try:
            return int(jobId)
        except ValueError:
            raise batchelor.BatchelorException(
                'parsing of qsub output to get job id failed.')
    finally:
        # remove the temporary script, also when submission or parsing failed
        batchelor.runCommand("rm -f " + fileName)
def getListOfJobStates(jobName, username = None, detailed = True):
    """Run ``llq`` and return one JobStatus per matching job.

    jobName  -- name a job must have to be returned; ``None`` returns
                every job found in the output
    username -- accepted but not used; ``llq`` is always called with
                ``-u `whoami```
    detailed -- adds ``-x`` to the ``llq`` call when true

    Each "===== Job Step mgmt.<id>..." line starts a new JobStatus; the
    subsequent "Step Virtual Memory", "Status" and "Step User Time" lines
    fill in memory usage, status and CPU time (in hours).

    Raises batchelor.BatchelorException if ``llq`` returns non-zero or if
    one of the evaluated lines cannot be parsed.
    """
    command = "llq -u `whoami` -m -x" if detailed else "llq -u `whoami` -m"
    (returncode, stdout, stderr) = batchelor.runCommand(command)
    if returncode != 0:
        raise batchelor.BatchelorException("llq failed (stderr: '" + stderr + "')")

    def requireJobId(jobId):
        # every evaluated field must come after a job step header
        if jobId < 0:
            raise batchelor.BatchelorException("parsing of llq output failed, got job name before job id.")

    def afterPrefix(text, prefix):
        # Return what follows the literal prefix. str.lstrip(prefix) is not
        # suitable: it strips any leading characters contained in prefix.
        return text[len(prefix):].strip()

    jobStates = []
    currentJobId = -1
    currentJobStatus = None
    for rawLine in stdout.split('\n'):
        if rawLine.startswith("===== Job Step mgmt."):
            try:
                # the id is the text between the first and the last '.'
                currentJobId = int(rawLine[rawLine.find(".")+1:rawLine.rfind(".")])
                currentJobStatus = JobStatus(currentJobId)
            except ValueError:
                raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
        # normalise whitespace so that the prefixes below match exactly
        line = ' '.join(rawLine.split())
        if line.startswith("Job Name: "):
            requireJobId(currentJobId)
            name = line[10:]
            if jobName is None or name == jobName:
                jobStates.append(currentJobStatus)
        elif line.startswith("Step Virtual Memory: "):
            requireJobId(currentJobId)
            try:
                parsed = afterPrefix(line, "Step Virtual Memory:").split()
                currentJobStatus.setMemoryUsage(float(parsed[0]) * _kMemoryUnits[parsed[1]], 0)
            except (ValueError, IndexError, KeyError):
                raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
        elif line.startswith("Status: "):
            requireJobId(currentJobId)
            # With lstrip("Status: ") the value 'Submission Error' lost its
            # leading 'Su' and was never recognised as an error state.
            status = afterPrefix(line, "Status:")
            currentJobStatus.setStatus(JobStatus.kUnknown, name = status)
            if status == 'Running':
                currentJobStatus.setStatus(JobStatus.kRunning)
            elif status in ('I', 'Idle', 'Pending'):
                currentJobStatus.setStatus(JobStatus.kWaiting)
            elif status in ('Submission Error', 'Terminated', 'Removed', 'Remove Pending'):
                currentJobStatus.setStatus(JobStatus.kError)
        elif line.startswith("Step User Time: "):
            requireJobId(currentJobId)
            time_str = afterPrefix(line, "Step User Time:").split(':')
            try:
                hours = float(time_str[0])
                minutes = float(time_str[1])
                seconds = float(time_str[2])
                total_time = hours + minutes / 60.0 + seconds / 3600.0
                currentJobStatus.setCpuTime(total_time, 0)
            except (ValueError, IndexError):
                raise batchelor.BatchelorException("parsing of llq output to get job id failed.")
    return jobStates
def _submitJob(config, command, outputFile, jobName, wd=None, nTasks=None):
    """Write a SLURM batch script, submit it with ``sbatch``, return the id.

    If the config has a ``max_active_jobs`` option, the call first blocks
    (polling every 90 seconds) until getListOfActiveJobs(None) reports
    fewer jobs than that limit; a failing query counts as "no free slot".

    config     -- object offering ``get`` / ``has_option``
    command    -- text appended at the end of the batch script
    outputFile -- value of ``#SBATCH -o``
    jobName    -- value of ``#SBATCH -J``; ``None`` omits the directive
    wd         -- value of ``#SBATCH -D``; defaults to the current
                  working directory
    nTasks     -- if not ``None``: written as ``--ntasks`` or, for the
                  'mpp3' cluster, converted to ``--nodes`` with 64 tasks
                  per node plus the configured ``--ntasks-per-node``

    Raises batchelor.BatchelorException if ``sbatch`` returns non-zero or
    if its output cannot be parsed.
    """
    section = submoduleIdentifier()
    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        i = 0
        waitTime = 90
        while True:
            try:
                nRunningJobs = len(getListOfActiveJobs(None))
            except batchelor.BatchelorException:
                # treat a failed query like a full queue: wait and retry
                nRunningJobs = max_active_jobs
            if nRunningJobs < max_active_jobs:
                break
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)  # wait 1.5 min
            i += 1
        if i > 0:
            sys.stdout.write("\r")

    if wd is None:
        wd = os.getcwd()

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        headerFileName = batchelor._getRealPath(config.get(section, "header_file"))
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" + config.get(section, "wall_clock_limit") + "\n")
            clusters = config.get(section, "clusters")
            if clusters != 'mpp3':
                tempFile.write("#SBATCH --mem-per-cpu=" + config.get(section, "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            if nTasks is not None:
                if clusters != 'mpp3':
                    tempFile.write("#SBATCH --ntasks={0:d} \n".format(nTasks))
                else:
                    # round up to whole nodes of 64 tasks each
                    tempFile.write("#SBATCH --nodes={0:d} \n".format((nTasks + 63) // 64))
                    tempFile.write("#SBATCH --ntasks-per-node={0} \n".format(
                        config.get(section, "n_tasks_per_node")))
            tempFile.write("#SBATCH --clusters={0}\n".format(clusters))
            if clusters not in ['cm2_tiny', 'mpp3']:
                tempFile.write("#SBATCH --partition={0}\n\n".format(config.get(section, "partition")))
            if clusters == 'cm2' or clusters == 'c2pap':
                # NOTE(review): --qos is filled from the "partition" option,
                # as in the original code -- confirm that this is intended.
                tempFile.write("#SBATCH --qos={0}\n\n".format(config.get(section, "partition")))
            tempFile.write("module load slurm_setup \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the generated script already starts with its own shebang
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        cmnd = "sbatch " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # remove the temporary script even if writing or submitting raised
        batchelor.runCommand("rm -f " + fileName)

    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
    # the job id is read from the fourth whitespace-separated word
    try:
        jobId = int(stdout.split()[3])
    except (ValueError, IndexError):
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')
    return jobId