import os
import time
import traceback
import subprocess
import commands  # Python 2 stdlib; provides getstatusoutput

# NOTE: Logger, Plugin and runcommand are assumed to be provided by the
# surrounding package; their import lines are not part of this excerpt.


class poe(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def isLocalProcess(self):
        return True

    def getName(self):
        return 'poe'

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        return None

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo,
                  mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None,
                  dumpEventOutputs=False):
        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "#@ tasks_per_node = 1" + "\n"
        submit_script += "source /etc/profile.d/modules.sh" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "source ${VO_ATLAS_SW_DIR}/local/setup-yampl.sh" + "\n"
        submit_script += "export PYTHONPATH=/cvmfs/atlas.cern.ch/repo/sw/local/noarch/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "env" + "\n"
        # Launch the Yoda master under POE, via parrot_run (cctools).
        submit_script += "poe parrot_run python-mpi " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") \
                         + " --globalWorkingDir=" + globalYodaDir \
                         + " --localWorkingDir=" + localWorkingDir \
                         + " --outputDir=" + os.path.dirname(globalYodaDir) \
                         + " --dumpEventOutputs"
        self.__log.debug("POE submit script: %s" % submit_script)

        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script, stdout=yoda_stdout, stderr=yoda_stderr, shell=True)

        # Poll every 30 s; log a heartbeat roughly every 10 minutes.
        i = 20
        while hpcJob and hpcJob.poll() is None:
            if i == 0:
                self.__log.debug("Yoda process is running")
                i = 20
            time.sleep(30)
            i -= 1
        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)
        return 0, None

    def poll(self, jobid):
        return None
class arc(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def isLocalProcess(self):
        return True

    def getName(self):
        return 'arc'

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        return None

    def convertNodeList(self, nodelist):
        # Expand a compressed Slurm-style node list into individual node names.
        try:
            if '[' in nodelist:
                numNames = []
                preName, numList = nodelist.split('[')
                numList, postName = numList.split(']')
                for items in numList.split(","):
                    if '-' not in items:
                        numNames.append(preName + items + postName)
                    else:
                        start, end = items.split('-')
                        numLen = len(start)
                        for i in range(int(start), int(end) + 1):
                            num = str(i).zfill(numLen)
                            numNames.append(preName + num + postName)
                return ','.join(numNames)
            else:
                return nodelist
        except:
            self.__log.debug(traceback.format_exc())
            return nodelist

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo,
                  mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None):
        # Resolved for completeness; the non-MPI launch below does not use it.
        nodelist = ""
        if 'SLURM_NODELIST' in os.environ:
            nodelist = os.environ['SLURM_NODELIST']
        elif 'PBS_NODELIST' in os.environ:
            nodelist = os.environ['PBS_NODELIST']
        nodelist = self.convertNodeList(nodelist)

        submit_script = "#!/bin/bash -l" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "source ${VO_ATLAS_SW_DIR}/local/setup-yampl.sh" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "env" + "\n"
        submit_script += "python " + os.path.join(globalWorkingDir, "HPC/HPCJob.py") \
                         + " --globalWorkingDir=" + globalYodaDir \
                         + " --localWorkingDir=" + localWorkingDir \
                         + " --nonMPIMode"
        self.__log.debug("ARC submit script: %s" % submit_script)

        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script, stdout=yoda_stdout, stderr=yoda_stderr, shell=True)

        # Poll every 30 s; log a heartbeat roughly every 10 minutes.
        i = 20
        while hpcJob and hpcJob.poll() is None:
            if i == 0:
                self.__log.debug("Yoda process is running")
                i = 20
            time.sleep(30)
            i -= 1
        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)
        return 0, None

    def poll(self, jobid):
        return None
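# Worked example for convertNodeList above (hypothetical node names):
#   "nid[00001-00003,00007]" -> "nid00001,nid00002,nid00003,nid00007"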
class slurm(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        # copied from RunJobEdison; note that the parsing below still assumes
        # showbf-style columns (d[2] = node count, d[3] = duration) even though
        # the command was switched to sinfo, whose default output differs.
        cmd = 'sinfo '
        self.__log.info("Executing command: '%s'" % cmd)
        res_tuple = runcommand(cmd)
        self.__log.info("Executing command output: %s" % str(res_tuple))
        showbf_str = ""
        if res_tuple[0] == 0:
            showbf_str = res_tuple[1]

        res = {}
        self.__log.info("Available resources in %s partition" % partition)
        self.__log.info(showbf_str)
        if showbf_str:
            shobf_out = showbf_str.splitlines()
            self.__log.info("Fitted resources")
            for l in shobf_out[2:]:
                d = l.split()
                nodes = int(d[2])
                if nodes < int(min_nodes):
                    continue
                if not d[3] == 'INFINITY':
                    wal_time_arr = d[3].split(":")
                    if len(wal_time_arr) < 4:
                        # H:M:S format, capped at 24 hours
                        wal_time_sec = int(wal_time_arr[0]) * (60 * 60) + int(wal_time_arr[1]) * 60 + int(wal_time_arr[2])
                        if wal_time_sec > 24 * 3600:
                            wal_time_sec = 24 * 3600
                    else:
                        # D:H:M:S format, treated as the 24-hour maximum
                        wal_time_sec = 24 * 3600
                else:
                    wal_time_sec = 12 * 3600

                # Fitting Hopper policy
                # https://www.nersc.gov/users/computational-systems/hopper/running-jobs/queues-and-policies/
                nodes = max_nodes if nodes > max_nodes else nodes
                if nodes < 682 and wal_time_sec > 48 * 3600:
                    wal_time_sec = 48 * 3600
                elif nodes < 4096 and wal_time_sec > 36 * 3600:
                    wal_time_sec = 36 * 3600
                elif nodes < 5462 and wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600
                elif wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600

                if wal_time_sec < int(min_walltime_m) * 60:
                    continue
                self.__log.info("Nodes: %s, Walltime (str): %s, Walltime (min): %s" % (nodes, d[3], wal_time_sec / 60))
                res.update({nodes: wal_time_sec})
        else:
            self.__log.info("No available resources. Default values will be used.")
        self.__log.info("Get resources: %s" % res)
        return res

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo,
                  mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None,
                  dumpEventOutputs=False):
        submit_script = "#!/bin/bash -l" + "\n"
        if queue == 'premium':
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=premium\n"
        elif queue == "scavenger":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=scavenger\n"
        elif queue == "low":
            submit_script += "#SBATCH -p regular\n"
            submit_script += "#SBATCH --qos=low\n"
        else:
            submit_script += "#SBATCH -p " + queue + "\n"
        if repo:
            submit_script += "#SBATCH -A " + repo + "\n"
        submit_script += "#SBATCH -N " + str(nodes) + "\n"
        submit_script += "#SBATCH --signal=SIGUSR1@60\n"
        submit_script += "#SBATCH -t " + walltime + "\n"
        submit_script += "#SBATCH --ntasks-per-node=1\n"
        submit_script += "#SBATCH --cpus-per-task=" + str(cpuPerNode) + "\n"
        submit_script += "#SBATCH -J ES_job" + "\n"
        submit_script += "#SBATCH -o athena_stdout.txt" + "\n"
        submit_script += "#SBATCH -e athena_stderr.txt" + "\n"
        # SLURM's equivalent of PBS_O_WORKDIR (the original used the
        # non-existent $SBATCH_O_WORKDIR)
        submit_script += "cd $SLURM_SUBMIT_DIR" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "env" + "\n"
        submit_script += "srun -N " + str(nodes) + " python-mpi " \
                         + os.path.join(globalWorkingDir, "HPC/HPCJob.py") \
                         + " --globalWorkingDir=" + globalYodaDir \
                         + " --localWorkingDir=" + localWorkingDir
        if dumpEventOutputs:
            submit_script += " --dumpEventOutputs"

        self.__submit_file = os.path.join(globalYodaDir, 'submit_script')
        handle = open(self.__submit_file, 'w')
        handle.write(submit_script)
        handle.close()
        self.__log.info("submit script:\n%s" % submit_script)
        cmd = "sbatch " + self.__submit_file
        self.__log.info("submitting HPC job: %s" % cmd)
        status, output = runcommand(cmd)
        self.__log.info("submitting HPC job: (status: %s, output: %s)" % (status, output))
        self.__jobid = None
        if status == 0:
            # sbatch prints "Submitted batch job <id>"; keep the last token.
            self.__jobid = output.replace("\n", "").split(" ")[-1]
            return 0, self.__jobid
        return -1, None

    def poll(self, jobid):
        # Poll the job state in the batch system and map it to a Yoda state.
        cmd = "scontrol show job " + jobid
        self.__log.info("polling HPC job: %s" % cmd)
        status, output = runcommand(cmd)
        if status == 0:
            self.__failedPollTimes = 0
            state = None
            for line in output.split("\n"):
                line = line.strip()
                if line.startswith('JobState'):
                    state = line.split(" ")[0].split("=")[1]
            if state == "COMPLETED":
                self.__log.info("HPC job complete")
                return "Complete"
            if state == "RUNNING":
                self.__log.info("HPC job is running")
                return "Running"
            if state == "PENDING":
                self.__log.info("HPC job is pending")
                return "Queue"
            if state == "FAILED":
                self.__log.info("HPC job is failed")
                return "Failed"
            if state == "CANCELLED":
                self.__log.info("HPC job is cancelled")
                return "Failed"
            if state == "TIMEOUT":
                self.__log.info("HPC job is timed out")
                return "Failed"
            self.__log.info("HPC job is in unknown state")
            return 'Unknown'
        else:
            self.__log.info("polling HPC job: (status: %s, output: %s)" % (status, output))
            if 'Invalid job id specified' in output:
                self.__log.info("Unknown Job Id. Set Job Complete.")
                return "Complete"
            self.__failedPollTimes += 1
            self.__log.error('Failing HPC job because the polling command has failed '
                             + str(self.__failedPollTimes) + ' times.')
            return 'Unknown'

    def delete(self, jobid):
        command = "scancel " + jobid
        status, output = runcommand(command)
        self.__log.debug("Run Command: %s " % command)
        self.__log.debug("Status: %s, Output: %s" % (status, output))
class pbs(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        # copied from RunJobEdison; same fitting logic as slurm.getHPCResources
        cmd = 'showbf -p %s' % partition
        self.__log.info("Executing command: '%s'" % cmd)
        res_tuple = commands.getstatusoutput(cmd)
        self.__log.info("Executing command output: %s" % str(res_tuple))
        showbf_str = ""
        if res_tuple[0] == 0:
            showbf_str = res_tuple[1]

        res = {}
        self.__log.info("Available resources in %s partition" % partition)
        self.__log.info(showbf_str)
        if showbf_str:
            shobf_out = showbf_str.splitlines()
            self.__log.info("Fitted resources")
            for l in shobf_out[2:]:
                d = l.split()
                nodes = int(d[2])
                if nodes < int(min_nodes):
                    continue
                if not d[3] == 'INFINITY':
                    wal_time_arr = d[3].split(":")
                    if len(wal_time_arr) < 4:
                        # H:M:S format, capped at 24 hours
                        wal_time_sec = int(wal_time_arr[0]) * (60 * 60) + int(wal_time_arr[1]) * 60 + int(wal_time_arr[2])
                        if wal_time_sec > 24 * 3600:
                            wal_time_sec = 24 * 3600
                    else:
                        # D:H:M:S format, treated as the 24-hour maximum
                        wal_time_sec = 24 * 3600
                else:
                    wal_time_sec = 12 * 3600

                # Fitting Hopper policy
                # https://www.nersc.gov/users/computational-systems/hopper/running-jobs/queues-and-policies/
                nodes = max_nodes if nodes > max_nodes else nodes
                if nodes < 682 and wal_time_sec > 48 * 3600:
                    wal_time_sec = 48 * 3600
                elif nodes < 4096 and wal_time_sec > 36 * 3600:
                    wal_time_sec = 36 * 3600
                elif nodes < 5462 and wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600
                elif wal_time_sec > 12 * 3600:
                    wal_time_sec = 12 * 3600

                if wal_time_sec < int(min_walltime_m) * 60:
                    continue
                self.__log.info("Nodes: %s, Walltime (str): %s, Walltime (min): %s" % (nodes, d[3], wal_time_sec / 60))
                res.update({nodes: wal_time_sec})
        else:
            self.__log.info("No available resources. Default values will be used.")
        self.__log.info("Get resources: %s" % res)
        return res

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo,
                  mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None,
                  dumpEventOutputs=False):
        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "#PBS -q " + queue + "\n"
        if repo:
            submit_script += "#PBS -A " + repo + "\n"
        submit_script += "#PBS -l mppwidth=" + str(mppwidth) + "\n"
        submit_script += "#PBS -l walltime=" + walltime + "\n"
        submit_script += "#PBS -N ES_job" + "\n"
        submit_script += "#PBS -j oe" + "\n"
        submit_script += "#PBS -o athena_stdout.txt" + "\n"
        submit_script += "#PBS -e athena_stderr.txt" + "\n"
        submit_script += "cd $PBS_O_WORKDIR" + "\n"
        submit_script += "module load mpi4py" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/sw/python-yampl/python-yampl/1.0/lib.linux-x86_64-2.6:$PYTHONPATH" + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/boto/lib/python2.6/site-packages:$PYTHONPATH\n"
        submit_script += "export PYTHONPATH=/project/projectdirs/atlas/pilot/grid_env/external:$PYTHONPATH\n"
        submit_script += "export LD_LIBRARY_PATH=/project/projectdirs/atlas/sw/python-yampl/yampl/1.0/lib:$LD_LIBRARY_PATH" + "\n"
        submit_script += "export X509_USER_PROXY=/global/homes/w/wguan/x509up_u23959" + "\n"
        submit_script += "export X509_CERT_DIR=/project/projectdirs/atlas/pilot/grid_env/external/grid-security/certificates" + "\n"
        submit_script += "env" + "\n"
        submit_script += "aprun -n " + str(nodes) + " -N " + str(mppnppn) + " -cc none python-mpi " \
                         + os.path.join(globalWorkingDir, "HPC/HPCJob.py") \
                         + " --globalWorkingDir=" + globalYodaDir \
                         + " --localWorkingDir=" + localWorkingDir

        self.__submit_file = os.path.join(globalYodaDir, 'submit_script')
        handle = open(self.__submit_file, 'w')
        handle.write(submit_script)
        handle.close()
        self.__log.info("submit script:\n%s" % submit_script)
        cmd = "qsub " + self.__submit_file
        self.__log.info("submitting HPC job: %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        self.__log.info("submitting HPC job: (status: %s, output: %s)" % (status, output))
        self.__jobid = None
        if status == 0:
            # qsub prints "<jobid>.<server>"; keep the numeric job id part.
            self.__jobid = output.replace("\n", "").split(".")[0]
            return 0, self.__jobid
        return -1, None

    def poll(self, jobid):
        # Poll the job state in the batch system and map it to a Yoda state.
        cmd = "qstat " + jobid
        self.__log.info("polling HPC job: %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        if status == 0:
            self.__failedPollTimes = 0
            state = None
            for line in output.split("\n"):
                line = line.strip()
                if line.startswith(jobid):
                    # split on whitespace runs; the state is the
                    # second-to-last column of the qstat line
                    state = line.split()[-2]
            if state == "C":
                self.__log.info("HPC job complete")
                return "Complete"
            if state == "R":
                return "Running"
            if state == "Q":
                return "Queue"
            return 'Unknown'
        else:
            self.__log.info("polling HPC job: (status: %s, output: %s)" % (status, output))
            if 'Unknown Job Id Error' in output:
                self.__log.info("Unknown Job Id. Set Job Complete.")
                return "Complete"
            self.__failedPollTimes += 1
            if self.__failedPollTimes > 5:
                return "Failed"
            return 'Unknown'

    def delete(self, jobid):
        command = "qdel " + jobid
        status, output = commands.getstatusoutput(command)
        self.__log.debug("Run Command: %s " % command)
        self.__log.debug("Status: %s, Output: %s" % (status, output))
class mpi(Plugin):
    def __init__(self, logFileName):
        self.__log = Logger(logFileName)
        self.__failedPollTimes = 0

    def getHPCResources(self, partition, max_nodes=None, min_nodes=2, min_walltime_m=30):
        return None

    def convertNodeList(self, nodelist):
        # Expand a compressed Slurm-style node list into individual node names
        # (same helper as in the arc plugin).
        try:
            if '[' in nodelist:
                numNames = []
                preName, numList = nodelist.split('[')
                numList, postName = numList.split(']')
                for items in numList.split(","):
                    if '-' not in items:
                        numNames.append(preName + items + postName)
                    else:
                        start, end = items.split('-')
                        numLen = len(start)
                        for i in range(int(start), int(end) + 1):
                            num = str(i).zfill(numLen)
                            numNames.append(preName + num + postName)
                return ','.join(numNames)
            else:
                return nodelist
        except:
            self.__log.debug(traceback.format_exc())
            return nodelist

    def submitJob(self, globalWorkingDir, globalYodaDir, localWorkingDir, queue, repo,
                  mppwidth, mppnppn, walltime, nodes, localSetup=None, cpuPerNode=None,
                  dumpEventOutputs=False):
        # Resolved for completeness; the srun launch below does not use it.
        nodelist = ""
        if 'SLURM_NODELIST' in os.environ:
            nodelist = os.environ['SLURM_NODELIST']
        elif 'PBS_NODELIST' in os.environ:
            nodelist = os.environ['PBS_NODELIST']
        nodelist = self.convertNodeList(nodelist)

        submit_script = "#!/bin/bash -l" + "\n"
        submit_script += "module load mpi4py openmpi-ccm" + "\n"
        if localSetup:
            submit_script += localSetup + "\n"
        submit_script += "export PYTHONPATH=%s:$PYTHONPATH\n" % globalWorkingDir
        submit_script += "env" + "\n"
        submit_script += "srun -N " + str(nodes) + " python-mpi " \
                         + os.path.join(globalWorkingDir, "HPC/HPCJob.py") \
                         + " --globalWorkingDir=" + globalYodaDir \
                         + " --localWorkingDir=" + localWorkingDir
        self.__log.debug("MPI submit script: %s" % submit_script)

        yoda_stdout = open(os.path.join(globalYodaDir, 'yoda_stdout.txt'), 'a')
        yoda_stderr = open(os.path.join(globalYodaDir, 'yoda_stderr.txt'), 'a')
        hpcJob = subprocess.Popen(submit_script, stdout=yoda_stdout, stderr=yoda_stderr, shell=True)
        while hpcJob and hpcJob.poll() is None:
            self.__log.debug("Yoda process is running")
            time.sleep(30)
        self.__log.debug("Yoda process terminated")
        self.__log.debug("Yoda process return code: %s" % hpcJob.returncode)
        return 0, None

    def poll(self, jobid):
        return None
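# Hypothetical usage sketch (not part of the original module): how a caller
# such as the Yoda/HPCManager layer might drive one of these plugins. All
# paths, queue names and sizes below are illustrative assumptions.
if __name__ == '__main__':
    plugin = slurm('hpc_plugin.log')
    status, jobid = plugin.submitJob(
        globalWorkingDir='/global/work',
        globalYodaDir='/global/work/yoda',
        localWorkingDir='/tmp/yoda_work',
        queue='regular',
        repo=None,
        mppwidth=48,
        mppnppn=24,
        walltime='02:00:00',
        nodes=2,
        cpuPerNode=24)
    if status == 0:
        # Poll once; a real caller would loop until Complete/Failed.
        print 'job %s state: %s' % (jobid, plugin.poll(jobid))
    else:
        print 'submission failed'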