def submitJob(config, command, outputFile, jobName, arrayStart = None, arrayEnd = None, arrayStep = None):
    """Submit `command` via `qsub` and return the job id as an int.

    The configured header file is copied into a temporary script, `command`
    is appended to it and the script is fed to `qsub` on stdin.

    Args:
        config: configuration object; "header_file", "project", "queue" and
            "memory" are read from the section submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: value passed to `-o` (`-j y` is always set).
        jobName: value passed to `-N`; the option is omitted when None.
        arrayStart, arrayEnd, arrayStep: when arrayStart is not None,
            `-t <start>-<end>:<step>` is added to the command line.

    Returns:
        The job id parsed from the stdout of `qsub`.

    Raises:
        batchelor.BatchelorException: if `qsub` returns a non-zero code or
            no integer job id can be extracted from its stdout.
    """
    section = submoduleIdentifier()
    isArrayJob = arrayStart is not None
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        if jobName is not None:
            cmnd += "-N " + jobName + " "
        if isArrayJob:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-P " + config.get(section, "project") + " "
        cmnd += "-q " + config.get(section, "queue") + " "
        cmnd += "-l h_vmem=" + config.get(section, "memory") + " "
        cmnd += _getExcludedHostsString(config)
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # the script is no longer needed once qsub has returned; removing it
        # here also covers the error paths, which used to leak the file
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # str.lstrip() strips a character set, not a prefix, so the prefix is
    # removed explicitly and the id is cut at the first terminator.
    if isArrayJob:
        (prefix, terminator) = ("Your job-array ", ".")
    else:
        (prefix, terminator) = ("Your job ", " ")
    jobId = stdout[len(prefix):] if stdout.startswith(prefix) else stdout
    jobId = jobId.partition(terminator)[0]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
def submitJob(config, command, outputFile, jobName, wd=None):
    """Submit `command` via `sbatch` and return the job id as an int.

    If the option "max_active_jobs" is configured, the call blocks (polling
    getListOfActiveJobs(None) every 90 seconds) until fewer jobs than that
    limit are active. A temporary batch script is then written from
    `#SBATCH` directives, the configured header file (without its shebang
    lines) and `command`.

    Args:
        config: configuration object; "header_file", "wall_clock_limit",
            "memory" and optionally "max_active_jobs" are read from the
            section submoduleIdentifier().
        command: text appended after the header in the batch script.
        outputFile: value of the `-o` directive.
        jobName: value of the `-J` directive; omitted when None.
        wd: value of the `-D` directive; defaults to os.getcwd().

    Returns:
        The fourth whitespace-separated token of sbatch's stdout as int.

    Raises:
        batchelor.BatchelorException: if `sbatch` returns a non-zero code or
            its stdout does not contain an integer at that position.
    """
    section = submoduleIdentifier()
    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        nWaits = 0
        waitTime = 90  # seconds, i.e. 1.5 min between polls
        while len(getListOfActiveJobs(None)) >= max_active_jobs:
            if nWaits == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)
            nWaits += 1
        if nWaits > 0:
            sys.stdout.write("\r")
    if wd is None:
        wd = os.getcwd()
    headerFileName = batchelor._getRealPath(config.get(section, "header_file"))
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" + config.get(section, "wall_clock_limit") + "\n")
            tempFile.write("#SBATCH --mem=" + config.get(section, "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            tempFile.write("#SBATCH --clusters=serial \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own shebang
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("sbatch " + fileName)
    finally:
        # also reached when writing the script fails, so no file is leaked
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
    try:
        return int(stdout.split()[3])
    except (IndexError, ValueError):
        # too few tokens is a parse failure as well, not a bare IndexError
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')
def submitJob(config, command, outputFile, jobName, wd = None):
    """Submit `command` via `llsubmit` and return the job id as an int.

    A temporary job command file is written from `#@` directives, the
    configured header file (without its shebang lines), an `exec 2>&1` line
    and `command`; it is fed to `llsubmit -` on stdin.

    Args:
        config: configuration object; "header_file", "group",
            "notification", "notify_user", "node_usage", "wall_clock_limit",
            "resources" and "job_type" are read from the section
            submoduleIdentifier().
        command: text appended at the end of the job command file.
        outputFile: used for both the `output` and the `error` directive.
        jobName: value of the `job_name` directive; omitted when None.
        wd: not supported here; any truthy value raises.

    Returns:
        The number following `"mgmt.` in the first line of llsubmit's stdout.

    Raises:
        batchelor.BatchelorException: if `wd` is given, if `llsubmit`
            returns a non-zero code or if no integer job id can be parsed.
    """
    if wd:
        raise batchelor.BatchelorException("Choosing the working directory is not jet implemented for {0}".format(submoduleIdentifier()))
    section = submoduleIdentifier()
    headerFileName = batchelor._getRealPath(config.get(section, "header_file"))
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#@ group = " + config.get(section, "group") + "\n")
            tempFile.write("#@ output = " + outputFile + "\n")
            tempFile.write("#@ error = " + outputFile + "\n")
            tempFile.write("#@ notification = " + config.get(section, "notification") + "\n")
            tempFile.write("#@ notify_user = " + config.get(section, "notify_user") + "\n")
            tempFile.write("#@ node_usage = " + config.get(section, "node_usage") + "\n")
            tempFile.write("#@ wall_clock_limit = " + config.get(section, "wall_clock_limit") + "\n")
            tempFile.write("#@ resources = " + config.get(section, "resources") + "\n")
            tempFile.write("#@ job_type = " + config.get(section, "job_type") + "\n")
            # NOTE(review): `class` is filled from the "job_type" option as
            # well - confirm this is intended and not a copy-paste slip.
            tempFile.write("#@ class = " + config.get(section, "job_type") + "\n")
            if jobName is not None:
                tempFile.write("#@ job_name = " + jobName + "\n")
            tempFile.write("#@ queue\n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own shebang
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write("exec 2>&1\n")
            tempFile.write("\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("llsubmit - < " + fileName)
    finally:
        # single cleanup point for the success path and every error path
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
    # example output stdout:
    #   llsubmit: The job "mgmt.12309" has been submitted.
    #
    # example output stderr:
    #   llsubmit: Stdin job command file written to "/tmp/loadlx_stdin.27558.CdoVxX".
    #
    #   INFO: Project: pr83mo
    #   INFO: Project's Expiration Date:    2015-01-31
    #   INFO: Budget:                     Total [cpuh]        Used [cpuh]      Credit [cpuh]
    #   INFO:                                  1350000      1011028 (75%)       338972 (25%)
    #
    #   llsubmit: Processed command file through Submit Filter: "/lrz/loadl/filter/submit_filter_c2pap.pl".
    firstLine = stdout.split("\n")[0]
    # take what lies between '"mgmt.' (6 characters) and the last '"'
    jobId = firstLine[firstLine.find('"mgmt.') + 6:firstLine.rfind('"')]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None, priority=None, ompNumThreads=None):
    """Submit `command` via `qsub` and return the job id as an int.

    The configured header file is copied into a temporary script, an
    optional `export OMP_NUM_THREADS=...` line and `command` are appended,
    and the script is fed to `qsub` on stdin.

    Args:
        config: configuration object; "header_file", "memory", "arch" and
            optionally "shortqueue" / "longqueue" are read from the section
            submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: value passed to `-o`; rejected if its real path
            contains the real path of the home directory.
        jobName: value passed to `-N`; the option is omitted when None.
        wd: value passed to `-wd` ("/tmp/" when falsy); rejected if its
            real path contains the real path of the home directory.
        arrayStart, arrayEnd, arrayStep: when arrayStart is not None,
            `-t <start>-<end>:<step>` is added to the command line.
        priority: a truthy value is rescaled with
            -1024 + 2048 * (priority + 1) / 2, truncated to int, limited
            from below to -1023 and passed to `-p` if the result is
            non-zero.
        ompNumThreads: when not None, exported as OMP_NUM_THREADS in the
            script and passed to `-pe mt`.

    Returns:
        The job id parsed from the stdout of `qsub`.

    Raises:
        batchelor.BatchelorException: for paths inside the home folder, a
            non-zero return code of `qsub` or unparsable output.
    """
    # some checks of the job-settings
    homeDir = os.path.realpath(os.path.expanduser('~'))
    if wd and os.path.realpath(wd).count(homeDir):
        raise batchelor.BatchelorException("The given working-directory is in your home-folder which is no allowed at E18: '{0}'".format(wd))
    if os.path.realpath(outputFile).count(homeDir):
        raise batchelor.BatchelorException("The given output-file is in your home-folder which is no allowed at E18: '{0}'".format(outputFile))
    if priority:
        priority = max(int(-1024 + 2048 * (priority+1.0)/2.0), -1023)

    section = submoduleIdentifier()
    isArrayJob = arrayStart is not None
    trueValues = [1, "1", "TRUE", "true", "True"]
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            if ompNumThreads is not None:
                scriptFile.write("export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
            scriptFile.write(command)
        cmnd = "qsub "
        cmnd += "-j y "
        cmnd += "-b no "
        cmnd += "-m n "
        if jobName is not None:
            cmnd += "-N " + jobName + " "
        if isArrayJob:
            cmnd += "-t " + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + " "
        cmnd += "-o '" + outputFile + "' "
        cmnd += "-wd '" + (wd if wd else "/tmp/") + "' "
        # exactly one queue resource is requested; "shortqueue" wins over
        # "longqueue", and medium is the fallback
        if config.has_option(section, "shortqueue") and config.get(section, "shortqueue") in trueValues:
            cmnd += "-l short=1 "
        elif config.has_option(section, "longqueue") and config.get(section, "longqueue") in trueValues:
            cmnd += "-l long=1 "
        else:
            cmnd += "-l medium=1 "
        cmnd += "-l h_pmem=" + config.get(section, "memory") + " "
        cmnd += "-l arch=" + config.get(section, "arch") + " "
        cmnd += _getExcludedHostsString(config)
        if priority:
            cmnd += "-p {0} ".format(priority)
        if ompNumThreads is not None:
            cmnd += "-pe mt {0} ".format(ompNumThreads)
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # removing the script here also covers the error paths, which used
        # to leak the temporary file
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # str.lstrip() strips a character set, not a prefix, so the prefix is
    # removed explicitly and the id is cut at the first terminator.
    if isArrayJob:
        (prefix, terminator) = ("Your job-array ", ".")
    else:
        (prefix, terminator) = ("Your job ", " ")
    jobId = stdout[len(prefix):] if stdout.startswith(prefix) else stdout
    jobId = jobId.partition(terminator)[0]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of qsub output to get job id failed.')
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit `command` via `bsub` and return the job id as an int.

    The configured header file is copied into a temporary script, `command`
    is appended to it and the script is fed to `bsub` on stdin.

    Args:
        config: configuration object; "header_file", "queue", "type",
            "pool" and optionally "memory" are read from the section
            submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: value passed to `-o`.
        jobName: value passed to `-J`; omitted when None. For array jobs a
            missing or empty name is replaced by seven random lowercase
            ASCII letters before the index range is appended.
        wd: when truthy, passed to `-cwd`.
        arrayStart, arrayEnd, arrayStep: when arrayStart is not None,
            `[<start>-<end>:<step>]` is appended to the job name.

    Returns:
        The job id parsed from the stdout of `bsub`.

    Raises:
        batchelor.BatchelorException: if `bsub` returns a non-zero code or
            no integer job id can be extracted from its stdout.
    """
    section = submoduleIdentifier()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        if arrayStart is not None:
            if not jobName:
                # the index range below needs a name to be attached to
                jobName = ''.join(random.sample(string.ascii_lowercase, 7))
            jobName = jobName + "[" + str(arrayStart) + "-" + str(
                arrayEnd) + ":" + str(arrayStep) + "]"
        cmnd = "bsub "
        if jobName is not None:
            cmnd += "-J " + jobName + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-q " + config.get(section, "queue") + " "
        if wd:
            # must precede -R: it used to be emitted inside the quoted
            # resource string, where its quotes broke the -R argument
            cmnd += "-cwd '{0}' ".format(wd)
        cmnd += "-R '"
        cmnd += " select[type=" + config.get(section, "type") + "]"
        cmnd += " rusage[pool=" + config.get(section, "pool") + "]"
        try:
            memory = config.get(section, "memory")
        except ConfigParser.NoOptionError:
            # the memory requirement is optional
            pass
        else:
            cmnd += " rusage[mem=" + memory + "]"
            cmnd += " select[maxmem>" + memory + "]"
        cmnd += _getExcludedHostsString(config)
        cmnd += "' "
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # removing the script here also covers the error paths, which used
        # to leak the temporary file
        batchelor.runCommand('rm -f ' + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr + "')")
    # example output: Job <533476534> is submitted to queue <1nd>.
    # str.lstrip() strips a character set, not a prefix, so the prefix is
    # removed explicitly.
    prefix = "Job <"
    jobId = stdout[len(prefix):] if stdout.startswith(prefix) else stdout
    jobId = jobId.partition(">")[0]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of bsub output to get job id failed.')
def submitJob(config, command, outputFile, jobName):
    """Submit `command` via `llsubmit` and return the job id as an int.

    A temporary job command file is written from `#@` directives, the
    configured header file (without its shebang lines) and `command`; it is
    fed to `llsubmit -` on stdin.

    Args:
        config: configuration object; "header_file", "group",
            "notification", "notify_user", "node_usage", "wall_clock_limit",
            "resources" and "job_type" are read from the section
            submoduleIdentifier().
        command: text appended at the end of the job command file.
        outputFile: value of the `output` directive.
        jobName: value of the `job_name` directive; omitted when None.

    Returns:
        The number following `"mgmt.` in the first line of llsubmit's stdout.

    Raises:
        batchelor.BatchelorException: if `llsubmit` returns a non-zero code
            or if no integer job id can be parsed.
    """
    section = submoduleIdentifier()
    headerFileName = batchelor._getRealPath(config.get(section, "header_file"))
    # (directive name, config option) pairs written in this order after the
    # `output` directive
    configuredDirectives = [
        ("notification", "notification"),
        ("notify_user", "notify_user"),
        ("node_usage", "node_usage"),
        ("wall_clock_limit", "wall_clock_limit"),
        ("resources", "resources"),
        ("job_type", "job_type"),
        # NOTE(review): `class` is filled from the "job_type" option as
        # well - confirm this is intended and not a copy-paste slip.
        ("class", "job_type"),
    ]
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#@ group = " + config.get(section, "group") + "\n")
            tempFile.write("#@ output = " + outputFile + "\n")
            for (directive, option) in configuredDirectives:
                tempFile.write("#@ " + directive + " = " + config.get(section, option) + "\n")
            if jobName is not None:
                tempFile.write("#@ job_name = " + jobName + "\n")
            tempFile.write("#@ queue\n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own shebang
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("llsubmit - < " + fileName)
    finally:
        # single cleanup point for the success path and every error path
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("llsubmit failed (stderr: '" + stderr + "')")
    # example output stdout:
    #   llsubmit: The job "mgmt.12309" has been submitted.
    #
    # example output stderr:
    #   llsubmit: Stdin job command file written to "/tmp/loadlx_stdin.27558.CdoVxX".
    #
    #   INFO: Project: pr83mo
    #   INFO: Project's Expiration Date:    2015-01-31
    #   INFO: Budget:                     Total [cpuh]        Used [cpuh]      Credit [cpuh]
    #   INFO:                                  1350000      1011028 (75%)       338972 (25%)
    #
    #   llsubmit: Processed command file through Submit Filter: "/lrz/loadl/filter/submit_filter_c2pap.pl".
    firstLine = stdout.split("\n")[0]
    # take what lies between '"mgmt.' (6 characters) and the last '"'
    jobId = firstLine[firstLine.find('"mgmt.') + 6:firstLine.rfind('"')]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of llsubmit output to get job id failed.')
def submitJob(config, command, outputFile, jobName, wd = None):
    """Submit `command` via `sbatch` and return the job id as an int.

    If the option "max_active_jobs" is configured, the call blocks (polling
    getListOfActiveJobs(None) every 90 seconds) until fewer jobs than that
    limit are active. A temporary batch script is then written from
    `#SBATCH` directives, the configured header file (without its shebang
    lines) and `command`.

    Args:
        config: configuration object; "header_file", "wall_clock_limit",
            "memory" and optionally "max_active_jobs" are read from the
            section submoduleIdentifier().
        command: text appended after the header in the batch script.
        outputFile: value of the `-o` directive.
        jobName: value of the `-J` directive; omitted when None.
        wd: value of the `-D` directive; defaults to os.getcwd().

    Returns:
        The fourth whitespace-separated token of sbatch's stdout as int.

    Raises:
        batchelor.BatchelorException: if `sbatch` returns a non-zero code or
            its stdout does not contain an integer at that position.
    """
    section = submoduleIdentifier()

    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        waitTime = 90  # seconds, i.e. 1.5 min between polls
        hasWaited = False
        while len(getListOfActiveJobs(None)) >= max_active_jobs:
            if not hasWaited:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
                hasWaited = True
            time.sleep(waitTime)
        if hasWaited:
            sys.stdout.write("\r")

    if wd is None:
        wd = os.getcwd()
    headerFileName = batchelor._getRealPath(config.get(section, "header_file"))

    directives = [
        "#SBATCH -D " + wd + "\n",
        "#SBATCH -o " + outputFile + "\n",
        "#SBATCH --time=" + config.get(section, "wall_clock_limit") + "\n",
        "#SBATCH --mem=" + config.get(section, "memory") + "\n",
    ]
    if jobName is not None:
        directives.append("#SBATCH -J " + jobName + "\n")
    directives.append("#SBATCH --get-user-env \n")
    directives.append("#SBATCH --export=NONE \n")
    directives.append("#SBATCH --clusters=serial \n\n\n")

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.writelines(directives)
            with open(headerFileName, 'r') as headerFile:
                # the script already has its own shebang
                tempFile.writelines(line for line in headerFile if not line.startswith("#!"))
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("sbatch " + fileName)
    finally:
        # also reached when writing the script fails, so no file is leaked
        batchelor.runCommand("rm -f " + fileName)

    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
    try:
        return int(stdout.split()[3])
    except (IndexError, ValueError):
        # too few tokens is a parse failure as well, not a bare IndexError
        raise batchelor.BatchelorException('parsing output of sbatch to get job id failed.')
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None):
    """Submit `command` via `condor_submit` and return the job id as an int.

    A job script (configured header file plus `command`) and a submit
    description file are created in the `.log` directory below the current
    working directory. Both files are registered for removal with
    `atexit`; they are not deleted when this function returns.

    Args:
        config: configuration object; "header_file", "memory", "disk" and
            "flavour" are read from the section submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: made absolute and used for `output`; `log` and `error`
            use the same path with `.condor` / `.err` appended.
        jobName: when truthy, passed to `-batch-name`.
        wd: when truthy, forwarded to batchelor.runCommand as keyword `wd`.
        arrayStart, arrayEnd, arrayStep: not supported; any value other
            than None raises.

    Returns:
        The sixth token of the second line of condor_submit's stdout, with
        trailing dots removed, as int.

    Raises:
        batchelor.BatchelorException: for array jobs, a `.log` directory
            path containing a space, a non-zero return code of
            `condor_submit` or unparsable output.
    """
    if arrayStart is not None or arrayEnd is not None or arrayStep is not None:
        raise batchelor.BatchelorException("Array jobs are not (yet) implementet for CERNs HTCondor system")

    filesDir = os.path.join(os.getcwd(), '.log')
    if " " in filesDir:
        raise batchelor.BatchelorException("Cannot handle submit directories with whitespaces")
    if not os.path.exists(filesDir):
        os.makedirs(filesDir)

    section = submoduleIdentifier()
    (fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir, prefix='submitFiles_', suffix='.submit')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove( submitFileName ))
    (fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir, prefix='scriptFiles_', suffix='.sh')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove( scriptFileName ))
    # 0o755 is the octal spelling accepted by Python 2.6+ and Python 3
    os.chmod(scriptFileName, 0o755)

    batchelor.runCommand("cp " + batchelor._getRealPath(config.get(section, "header_file")) + " " + scriptFileName)
    with open(scriptFileName, 'a') as scriptFile:
        scriptFile.write(command)

    with open(submitFileName, 'w') as submitFile:
        outputFile = os.path.abspath(outputFile)
        submitFile.write("executable = {0}\n".format(scriptFileName))
        # NOTE(review): after abspath() this guard is always true; kept to
        # leave the written file unchanged.
        if outputFile:
            submitFile.write("output = {0}\n".format(outputFile))
            submitFile.write("log = {0}.condor\n".format(outputFile))
            submitFile.write("error = {0}.err\n".format(outputFile))
        submitFile.write("should_transfer_files = NO\n") # Disable file transport
        submitFile.write("request_cpus = 1\n")
        submitFile.write("request_memory = {0}\n".format(config.get(section, "memory")))
        submitFile.write("request_disk = {0}\n".format(config.get(section, "disk")))
        submitFile.write("+JobFlavour = \"{0}\"\n".format(config.get(section, "flavour")))
        submitFile.write("queue 1\n")

    cmnd = "condor_submit '{0}'".format(submitFileName)
    if jobName:
        cmnd += " -batch-name {0} ".format(jobName)
    kwargs = {}
    if wd:
        kwargs['wd'] = wd
    (returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_submit failed (stderr: '" + stderr + "')")
    try:
        return int(stdout.split('\n')[1].split()[5].rstrip("."))
    except (IndexError, ValueError):
        # too few lines or tokens is a parse failure, not a bare IndexError
        raise batchelor.BatchelorException('parsing of condor_submit output to get job id failed.')
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit `command` via `qsub` and return the job id as an int.

    The configured header file is copied into a temporary script, `command`
    is appended to it and the script is fed to `qsub` on stdin.

    Args:
        config: configuration object; "header_file", "project", "queue" and
            "memory" are read from the section submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: value passed to `-o` (`-j y` is always set).
        jobName: value passed to `-N`; the option is omitted when None.
        wd: not supported here; any truthy value raises.
        arrayStart, arrayEnd, arrayStep: when arrayStart is not None,
            `-t <start>-<end>:<step>` is added to the command line.

    Returns:
        The job id parsed from the stdout of `qsub`.

    Raises:
        batchelor.BatchelorException: if `wd` is given, if `qsub` returns a
            non-zero code or if no integer job id can be parsed.
    """
    if wd:
        raise batchelor.BatchelorException(
            "Choosing the working directory is not jet implemented for {0}".
            format(submoduleIdentifier()))
    section = submoduleIdentifier()
    isArrayJob = arrayStart is not None
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)
        options = ["-j y "]
        if jobName is not None:
            options.append("-N " + jobName + " ")
        if isArrayJob:
            options.append("-t " + str(arrayStart) + "-" + str(arrayEnd) +
                           ":" + str(arrayStep) + " ")
        options.append("-o " + outputFile + " ")
        options.append("-P " + config.get(section, "project") + " ")
        options.append("-q " + config.get(section, "queue") + " ")
        options.append("-l h_vmem=" + config.get(section, "memory") + " ")
        options.append(_getExcludedHostsString(config))
        cmnd = "qsub " + "".join(options) + "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # removing the script here also covers the error paths, which used
        # to leak the temporary file
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # str.lstrip() strips a character set, not a prefix, so the prefix is
    # removed explicitly and the id is cut at the first terminator.
    if isArrayJob:
        (prefix, terminator) = ("Your job-array ", ".")
    else:
        (prefix, terminator) = ("Your job ", " ")
    jobId = stdout[len(prefix):] if stdout.startswith(prefix) else stdout
    jobId = jobId.partition(terminator)[0]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
def submitArrayJobs(config, commands, outputFile, jobName, wd=None):
    """Submit `commands` in chunks, each chunk as one `srun --multi-prog` job.

    The commands are split into consecutive chunks of at most
    "n_tasks_per_job" entries. For every chunk a shell snippet is built
    that creates a temporary directory below `$SCRATCH/tmp`, writes one
    script per command (shebang, configured header, command) plus an
    `srun.conf` listing them, and runs `srun -n <nTasks> --multi-prog`.
    The snippet is handed to `_submitJob`.

    Args:
        config: configuration object; "n_tasks_per_job" and "header_file"
            are read from the section submoduleIdentifier().
        commands: sequence of command strings.
        outputFile: output file of the jobs. If it is not "/dev/null" and
            more than one chunk is needed, `.<first>_<last>` (chunk index
            range, end exclusive) is appended per chunk.
        jobName: forwarded to `_submitJob`.
        wd: forwarded to `_submitJob`.

    Returns:
        A list with one job id per command; commands of the same chunk
        share the id returned by `_submitJob` for that chunk.

    Raises:
        batchelor.BatchelorException: if "n_tasks_per_job" is smaller
            than 1, which would otherwise never advance through `commands`.
        KeyError: if `commands` is non-empty and SCRATCH is not set.
    """
    nTasksPerJob = int(config.get(submoduleIdentifier(), "n_tasks_per_job"))
    if nTasksPerJob < 1:
        raise batchelor.BatchelorException(
            "n_tasks_per_job must be at least 1 (got {0})".format(nTasksPerJob))
    headerFileName = batchelor._getRealPath(
        config.get(submoduleIdentifier(), "header_file"))
    with open(headerFileName, 'r') as headerFile:
        # the header ends up inside a double-quoted `echo "..."`
        # NOTE(review): only '"' is escaped; `$` and backticks in the header
        # or the commands presumably get expanded by the shell that runs the
        # echo - confirm this is intended.
        header = headerFile.read().replace(r'"', r'\"')
    jids = []
    if not commands:
        return jids

    tmpDir = os.path.join(os.environ['SCRATCH'], 'tmp')
    if not os.path.isdir(tmpDir):
        os.makedirs(tmpDir)
    nCommands = len(commands)
    splitOutput = outputFile != "/dev/null" and nCommands > nTasksPerJob

    for first in range(0, nCommands, nTasksPerJob):
        last = min(nCommands, first + nTasksPerJob)
        nTasks = last - first
        parts = [
            'tmpDir=$(mktemp -d -p {TMPDIR})\ntrap "rm -rf \'${{tmpDir}}\'" EXIT\n'.format(
                TMPDIR=tmpDir)
        ]
        parts.append('echo "{srun}" > ${{tmpDir}}/srun.conf\n'.format(
            srun='\n'.join(
                "{i} bash ${{tmpDir}}/{i}.sh".format(i=k)
                for k in range(nTasks))))
        for k, cmd in enumerate(commands[first:last]):
            parts.append(
                'echo "#!/bin/bash\n{header}\n{cmd}" > ${{tmpDir}}/{i}.sh\n'.format(
                    header=header, cmd=cmd.replace(r'"', r'\"'), i=k))
        parts.append('srun -n {nTasks} --multi-prog ${{tmpDir}}/srun.conf'.format(
            nTasks=nTasks))
        fullCmd = "".join(parts)
        if splitOutput:
            chunkOutputFile = outputFile + ".{0}_{1}".format(first, last)
        else:
            chunkOutputFile = outputFile
        jid = _submitJob(config,
                         fullCmd,
                         chunkOutputFile,
                         jobName,
                         wd,
                         nTasks=nTasks)
        jids += [jid] * nTasks
    return jids
def submitJob(config, command, outputFile, jobName, wd = None, arrayStart = None, arrayEnd = None, arrayStep = None):
    """Submit `command` via `bsub` and return the job id as an int.

    The configured header file is copied into a temporary script, `command`
    is appended to it and the script is fed to `bsub` on stdin.

    Args:
        config: configuration object; "header_file", "queue", "type",
            "pool" and optionally "memory" are read from the section
            submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: value passed to `-o`.
        jobName: value passed to `-J`; omitted when None. For array jobs a
            missing or empty name is replaced by seven random lowercase
            ASCII letters before the index range is appended.
        wd: when truthy, passed to `-cwd`.
        arrayStart, arrayEnd, arrayStep: when arrayStart is not None,
            `[<start>-<end>:<step>]` is appended to the job name.

    Returns:
        The job id parsed from the stdout of `bsub`.

    Raises:
        batchelor.BatchelorException: if `bsub` returns a non-zero code or
            no integer job id can be extracted from its stdout.
    """
    section = submoduleIdentifier()
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            scriptFile.write(command)

        if arrayStart is not None:
            if not jobName:
                # the index range below needs a name to be attached to
                jobName = ''.join(random.sample(string.ascii_lowercase, 7))
            jobName = jobName + "[" + str(arrayStart) + "-" + str(arrayEnd) + ":" + str(arrayStep) + "]"

        # resource requirement string handed to -R as one quoted argument
        resources = " select[type=" + config.get(section, "type") + "]"
        resources += " rusage[pool=" + config.get(section, "pool") + "]"
        try:
            memory = config.get(section, "memory")
        except ConfigParser.NoOptionError:
            # the memory requirement is optional
            pass
        else:
            resources += " rusage[mem=" + memory + "]"
            resources += " select[maxmem>" + memory + "]"
        resources += _getExcludedHostsString(config)

        cmnd = "bsub "
        if jobName is not None:
            cmnd += "-J " + jobName + " "
        cmnd += "-o " + outputFile + " "
        cmnd += "-q " + config.get(section, "queue") + " "
        if wd:
            # must precede -R: it used to be emitted inside the quoted
            # resource string, where its quotes broke the -R argument
            cmnd += "-cwd '{0}' ".format(wd)
        cmnd += "-R '" + resources + "' "
        cmnd += "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # removing the script here also covers the error paths, which used
        # to leak the temporary file
        batchelor.runCommand('rm -f ' + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("bsub failed (stderr: '" + stderr + "')")
    # example output: Job <533476534> is submitted to queue <1nd>.
    # str.lstrip() strips a character set, not a prefix, so the prefix is
    # removed explicitly.
    prefix = "Job <"
    jobId = stdout[len(prefix):] if stdout.startswith(prefix) else stdout
    jobId = jobId.partition(">")[0]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException('parsing of bsub output to get job id failed.')
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None,
              priority=None,
              ompNumThreads=None):
    """Submit `command` via `qsub` and return the job id as an int.

    The configured header file is copied into a temporary script, an
    optional `export OMP_NUM_THREADS=...` line and `command` are appended,
    and the script is fed to `qsub` on stdin.

    Args:
        config: configuration object; "header_file", "memory", "arch" and
            optionally "shortqueue" / "longqueue" are read from the section
            submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: value passed to `-o`; rejected if its real path
            contains the real path of the home directory.
        jobName: value passed to `-N`; the option is omitted when None.
        wd: value passed to `-wd` ("/tmp/" when falsy); rejected if its
            real path contains the real path of the home directory.
        arrayStart, arrayEnd, arrayStep: when arrayStart is not None,
            `-t <start>-<end>:<step>` is added to the command line.
        priority: a truthy value is rescaled with
            -1024 + 2048 * (priority + 1) / 2, truncated to int, limited
            from below to -1023 and passed to `-p` if the result is
            non-zero.
        ompNumThreads: when not None, exported as OMP_NUM_THREADS in the
            script and passed to `-pe mt`.

    Returns:
        The job id parsed from the stdout of `qsub`.

    Raises:
        batchelor.BatchelorException: for paths inside the home folder, a
            non-zero return code of `qsub` or unparsable output.
    """
    # some checks of the job-settings
    homeDir = os.path.realpath(os.path.expanduser('~'))
    if wd and os.path.realpath(wd).count(homeDir):
        raise batchelor.BatchelorException(
            "The given working-directory is in your home-folder which is no allowed at E18: '{0}'"
            .format(wd))
    if os.path.realpath(outputFile).count(homeDir):
        raise batchelor.BatchelorException(
            "The given output-file is in your home-folder which is no allowed at E18: '{0}'"
            .format(outputFile))
    if priority:
        priority = max(int(-1024 + 2048 * (priority + 1.0) / 2.0), -1023)

    section = submoduleIdentifier()
    isArrayJob = arrayStart is not None

    def _flagIsSet(option):
        # an option counts as set only if present with one of these values
        return config.has_option(section, option) and config.get(
            section, option) in [1, "1", "TRUE", "true", "True"]

    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        batchelor.runCommand("cp " + batchelor._getRealPath(
            config.get(section, "header_file")) + " " + fileName)
        with open(fileName, 'a') as scriptFile:
            if ompNumThreads is not None:
                scriptFile.write(
                    "export OMP_NUM_THREADS={0}\n".format(ompNumThreads))
            scriptFile.write(command)

        options = ["-j y ", "-b no ", "-m n "]
        if jobName is not None:
            options.append("-N " + jobName + " ")
        if isArrayJob:
            options.append("-t " + str(arrayStart) + "-" + str(arrayEnd) +
                           ":" + str(arrayStep) + " ")
        options.append("-o '" + outputFile + "' ")
        options.append("-wd '" + (wd if wd else "/tmp/") + "' ")
        # exactly one queue resource is requested; "shortqueue" wins over
        # "longqueue", and medium is the fallback
        if _flagIsSet("shortqueue"):
            options.append("-l short=1 ")
        elif _flagIsSet("longqueue"):
            options.append("-l long=1 ")
        else:
            options.append("-l medium=1 ")
        options.append("-l h_pmem=" + config.get(section, "memory") + " ")
        options.append("-l arch=" + config.get(section, "arch") + " ")
        options.append(_getExcludedHostsString(config))
        if priority:
            options.append("-p {0} ".format(priority))
        if ompNumThreads is not None:
            options.append("-pe mt {0} ".format(ompNumThreads))
        cmnd = "qsub " + "".join(options) + "< " + fileName
        (returncode, stdout, stderr) = batchelor.runCommand(cmnd)
    finally:
        # removing the script here also covers the error paths, which used
        # to leak the temporary file
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("qsub failed (stderr: '" + stderr + "')")
    # example output: "Your job 1601905 ("J2415c980b8") has been submitted"
    # str.lstrip() strips a character set, not a prefix, so the prefix is
    # removed explicitly and the id is cut at the first terminator.
    if isArrayJob:
        (prefix, terminator) = ("Your job-array ", ".")
    else:
        (prefix, terminator) = ("Your job ", " ")
    jobId = stdout[len(prefix):] if stdout.startswith(prefix) else stdout
    jobId = jobId.partition(terminator)[0]
    try:
        return int(jobId)
    except ValueError:
        raise batchelor.BatchelorException(
            'parsing of qsub output to get job id failed.')
def submitJob(config,
              command,
              outputFile,
              jobName,
              wd=None,
              arrayStart=None,
              arrayEnd=None,
              arrayStep=None):
    """Submit `command` via `condor_submit` and return the job id as an int.

    A job script (configured header file plus `command`) and a submit
    description file are created in the `.log` directory below the current
    working directory. Both files are registered for removal with
    `atexit`; they are not deleted when this function returns.

    Args:
        config: configuration object; "header_file", "memory", "disk" and
            "flavour" are read from the section submoduleIdentifier().
        command: text appended to the header to form the job script.
        outputFile: made absolute and used for `output`; `log` and `error`
            use the same path with `.condor` / `.err` appended.
        jobName: when truthy, passed to `-batch-name`.
        wd: when truthy, forwarded to batchelor.runCommand as keyword `wd`.
        arrayStart, arrayEnd, arrayStep: not supported; any value other
            than None raises.

    Returns:
        The sixth token of the second line of condor_submit's stdout, with
        trailing dots removed, as int.

    Raises:
        batchelor.BatchelorException: for array jobs, a `.log` directory
            path containing a space, a non-zero return code of
            `condor_submit` or unparsable output.
    """
    arrayArguments = (arrayStart, arrayEnd, arrayStep)
    if any(argument is not None for argument in arrayArguments):
        raise batchelor.BatchelorException(
            "Array jobs are not (yet) implementet for CERNs HTCondor system")

    filesDir = os.path.join(os.getcwd(), '.log')
    if " " in filesDir:
        raise batchelor.BatchelorException(
            "Cannot handle submit directories with whitespaces")
    if not os.path.exists(filesDir):
        os.makedirs(filesDir)

    section = submoduleIdentifier()
    (fileDescriptor, submitFileName) = tempfile.mkstemp(dir=filesDir,
                                                        prefix='submitFiles_',
                                                        suffix='.submit')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(submitFileName))
    (fileDescriptor, scriptFileName) = tempfile.mkstemp(dir=filesDir,
                                                        prefix='scriptFiles_',
                                                        suffix='.sh')
    os.close(fileDescriptor)
    atexit.register(lambda: os.remove(scriptFileName))
    # 0o755 is the octal spelling accepted by Python 2.6+ and Python 3
    os.chmod(scriptFileName, 0o755)

    batchelor.runCommand("cp " + batchelor._getRealPath(
        config.get(section, "header_file")) + " " + scriptFileName)
    with open(scriptFileName, 'a') as scriptFile:
        scriptFile.write(command)

    outputFile = os.path.abspath(outputFile)
    submitLines = ["executable = {0}\n".format(scriptFileName)]
    # NOTE(review): after abspath() this guard is always true; kept to
    # leave the written file unchanged.
    if outputFile:
        submitLines.append("output = {0}\n".format(outputFile))
        submitLines.append("log = {0}.condor\n".format(outputFile))
        submitLines.append("error = {0}.err\n".format(outputFile))
    submitLines.append("should_transfer_files = NO\n")  # Disable file transport
    submitLines.append("request_cpus = 1\n")
    submitLines.append("request_memory = {0}\n".format(
        config.get(section, "memory")))
    submitLines.append("request_disk = {0}\n".format(
        config.get(section, "disk")))
    submitLines.append("+JobFlavour = \"{0}\"\n".format(
        config.get(section, "flavour")))
    submitLines.append("queue 1\n")
    with open(submitFileName, 'w') as submitFile:
        submitFile.writelines(submitLines)

    cmnd = "condor_submit '{0}'".format(submitFileName)
    if jobName:
        cmnd += " -batch-name {0} ".format(jobName)
    kwargs = {}
    if wd:
        kwargs['wd'] = wd
    (returncode, stdout, stderr) = batchelor.runCommand(cmnd, **kwargs)
    if returncode != 0:
        raise batchelor.BatchelorException("condor_submit failed (stderr: '" +
                                           stderr + "')")
    try:
        return int(stdout.split('\n')[1].split()[5].rstrip("."))
    except (IndexError, ValueError):
        # too few lines or tokens is a parse failure, not a bare IndexError
        raise batchelor.BatchelorException(
            'parsing of condor_submit output to get job id failed.')
def _submitJob(config, command, outputFile, jobName, wd=None, nTasks=None):
    """Submit `command` via `sbatch` and return the job id as an int.

    If the option "max_active_jobs" is configured, the call blocks (polling
    getListOfActiveJobs(None) every 90 seconds) until fewer jobs than that
    limit are active; a failing poll is treated like a full queue. A
    temporary batch script is then written from `#SBATCH` directives that
    depend on the configured "clusters" value, a `module load slurm_setup`
    line, the configured header file (without its shebang lines) and
    `command`.

    Args:
        config: configuration object; "header_file", "wall_clock_limit",
            "clusters" and, depending on the cluster, "memory",
            "partition", "n_tasks_per_node" and "max_active_jobs" are read
            from the section submoduleIdentifier().
        command: text appended after the header in the batch script.
        outputFile: value of the `-o` directive.
        jobName: value of the `-J` directive; omitted when None.
        wd: value of the `-D` directive; defaults to os.getcwd().
        nTasks: when not None, written as `--ntasks` or, for cluster
            'mpp3', converted into `--nodes` (64 tasks per node, rounded
            up) plus `--ntasks-per-node`.

    Returns:
        The fourth whitespace-separated token of sbatch's stdout as int.

    Raises:
        batchelor.BatchelorException: if `sbatch` returns a non-zero code or
            its stdout does not contain an integer at that position.
    """
    section = submoduleIdentifier()
    # check if only a certain amount of active jobs is allowed
    if config.has_option(section, "max_active_jobs"):
        max_active_jobs = int(config.get(section, "max_active_jobs"))
        i = 0
        waitTime = 90  # seconds, i.e. 1.5 min between polls
        while True:
            try:
                nRunningJobs = len(getListOfActiveJobs(None))
            except batchelor.BatchelorException:
                # if the queue cannot be queried, keep waiting
                nRunningJobs = max_active_jobs
            if nRunningJobs < max_active_jobs:
                break
            if i == 0:
                sys.stdout.write("Waiting for free slots")
                sys.stdout.flush()
            time.sleep(waitTime)
            i += 1
        if i > 0:
            sys.stdout.write("\r")
    if wd is None:
        wd = os.getcwd()
    headerFileName = batchelor._getRealPath(
        config.get(section, "header_file"))
    clusters = config.get(section, "clusters")
    (fileDescriptor, fileName) = tempfile.mkstemp()
    os.close(fileDescriptor)
    try:
        with open(fileName, 'w') as tempFile:
            tempFile.write("#!/bin/bash\n\n")
            tempFile.write("#SBATCH -D " + wd + "\n")
            tempFile.write("#SBATCH -o " + outputFile + "\n")
            tempFile.write("#SBATCH --time=" +
                           config.get(section, "wall_clock_limit") + "\n")
            if clusters != 'mpp3':
                tempFile.write("#SBATCH --mem-per-cpu=" +
                               config.get(section, "memory") + "\n")
            if jobName is not None:
                tempFile.write("#SBATCH -J " + jobName + "\n")
            tempFile.write("#SBATCH --get-user-env \n")
            tempFile.write("#SBATCH --export=NONE \n")
            if nTasks is not None:
                if clusters != 'mpp3':
                    tempFile.write("#SBATCH --ntasks={0:d} \n".format(nTasks))
                else:
                    # NOTE(review): the nesting of the --ntasks-per-node
                    # line was reconstructed from unindented source; it is
                    # assumed to belong to the mpp3 branch - confirm.
                    tempFile.write("#SBATCH --nodes={0:d} \n".format(
                        (nTasks + 63) // 64))
                    tempFile.write("#SBATCH --ntasks-per-node={0} \n".format(
                        config.get(section, "n_tasks_per_node")))
            tempFile.write("#SBATCH --clusters={0}\n".format(clusters))
            if clusters not in ['cm2_tiny', 'mpp3']:
                tempFile.write("#SBATCH --partition={0}\n\n".format(
                    config.get(section, "partition")))
            if clusters in ('cm2', 'c2pap'):
                # the qos value is taken from the "partition" option
                tempFile.write("#SBATCH --qos={0}\n\n".format(
                    config.get(section, "partition")))
            tempFile.write("module load slurm_setup \n\n\n")
            with open(headerFileName, 'r') as headerFile:
                for line in headerFile:
                    # the script already has its own shebang
                    if line.startswith("#!"):
                        continue
                    tempFile.write(line)
            tempFile.write("\n\n")
            tempFile.write(command)
        (returncode, stdout, stderr) = batchelor.runCommand("sbatch " + fileName)
    finally:
        # also reached when writing the script fails, so no file is leaked
        batchelor.runCommand("rm -f " + fileName)
    if returncode != 0:
        raise batchelor.BatchelorException("sbatch failed (stderr: '" + stderr + "')")
    try:
        return int(stdout.split()[3])
    except (IndexError, ValueError):
        # too few tokens is a parse failure as well, not a bare IndexError
        raise batchelor.BatchelorException(
            'parsing output of sbatch to get job id failed.')