def setFilePermissions(file, groupID):
    """Assign ``file`` to group ``groupID`` and make it owner/group read-write.

    The owner is left untouched (uid ``-1``) and the mode becomes
    ``S_IREAD | S_IWRITE | S_IRGRP | S_IWGRP``. The operation is best
    effort: an ordinary failure is logged as a warning and not raised.
    """
    mode = stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IWGRP
    try:
        os.chown(file, -1, groupID)
        os.chmod(file, mode)
    except Exception:
        # Kept broad so the call stays best effort, but no longer a bare
        # ``except`` that would also swallow KeyboardInterrupt/SystemExit.
        log('Warning - failed to set permissions on file ' + file, echo=True)
def quit(self):
    """Final cleanup: send SIGTERM to the fork subprocess, if there is one."""
    log("dawnHTCKull.py --- quit() -- final cleanup", echo=True)
    child = self.forkSubprocess
    if not child:
        return
    import signal
    os.kill(child.pid, signal.SIGTERM)
def noteEnd(self, test):
    """Record that ``test`` has finished by decrementing the running count.

    With debugging on, the message is logged before the decrement, so the
    count it reports still includes ``test``.
    """
    if debug():
        message = "Finished %s, now running %d tests" % (
            test.name, self.numberTestsRunning)
        log(message, echo=True)
    self.numberTestsRunning -= 1
def load(self, testlist):
    """Accept the tests to (possibly) run, reset batch state, and run them.

    Every test and each of its dependents gets a fresh ``batchDic`` (with
    ``'depends_on'`` set to None), ``submitted = False`` and
    ``batchstatus = "UNKNOWN"``. ``self.run()`` is then called.

    Per the original note, a test that could never run is assumed to
    already have a status other than CREATED.

    Returns the number of entries in the stored test list.
    """
    def reset_batch_state(item):
        # Same assignments, in the same order, for tests and dependents.
        item.batchDic = {}
        item.batchDic['depends_on'] = None
        item.submitted = False
        item.batchstatus = "UNKNOWN"

    log("Start submitting batch jobs ........ Note, only a max of %s jobs will be submitted at a time. " % (self.maxBatchAllowed), echo=True)

    self.testlist = testlist
    self.running = []
    self.numberTestsRunning = 0

    for entry in testlist:
        reset_batch_state(entry)
        for dependent in entry.dependents:
            reset_batch_state(dependent)

    self.run()
    return len(self.testlist)
def forkServerSetup(self):
    """Start the fork server and export its directory as ``FORKSERVERDIR``.

    Does nothing when ``FORKSERVERDIR`` is already in the environment.
    Otherwise the fork server script is launched with stdout/stderr sent to
    temporary files; after a two second pause the first line of the stdout
    file is stripped and stored in ``os.environ['FORKSERVERDIR']``.

    If that line cannot be read, only a note is logged and the environment
    variable is left unset.
    """
    import tempfile
    import time

    if 'FORKSERVERDIR' in os.environ:
        return

    forkCommand = "/usr/gapps/coop/forkserver/bin/forkserver.py"
    # Pre-bound so the error handler below never hits an unbound name when
    # creating a temporary file is itself what failed.
    outHandle = None
    errHandle = None
    try:
        outHandle = tempfile.NamedTemporaryFile('w')
        errHandle = tempfile.NamedTemporaryFile('w')
        self.forkSubprocess = subprocess.Popen(forkCommand, shell=True,
                                               stdout=outHandle,
                                               stderr=errHandle)
    except OSError:
        print("Error in running the forkserver. ", sys.exc_info()[0])
        for handle in (outHandle, errHandle):
            if handle is not None:
                handle.close()

    time.sleep(2)  # give the forkserver a sec to write out the directory name

    try:
        # ``with`` closes the reader, which the previous code never did.
        with open(outHandle.name, 'r') as newfile:
            line1 = newfile.readlines()
        os.environ['FORKSERVERDIR'] = line1[0].strip()
        log("Note: setting FORKSERVER env to be %s" % (line1[0]), echo=True)
    except Exception:
        # Best effort (missing file, empty output, failed launch); unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        log("Note: setting FORKSERVER env to be .", echo=True)
def kill(self, test):
    """Cancel the Slurm step whose listing line mentions ``test.jobname``.

    The cached listing in ``self.lastSqueueResult`` is refreshed through
    ``utils.getAllSlurmStepIds()`` when it is missing or was fetched more
    than 60 seconds ago. The first line containing ``test.jobname``
    supplies the id (its first whitespace-separated field) that is handed
    to ``scancel`` via ``utils.runThisCommand``.
    """
    # range(0,1) yields exactly one pass, so this loop body runs once.
    # NOTE(review): the nesting of the statements after the inner ``for``
    # was reconstructed from a flattened source -- confirm it.
    for killTimes in range(0,1):
        # Refresh the cached listing when absent or older than 60 seconds.
        if self.lastSqueueResult is None or ( (time.time() - self.lastTimeSqueueCalled) > 60): # in seconds
            self.lastSqueueResult= utils.getAllSlurmStepIds()
            self.lastTimeSqueueCalled= time.time() # set time
        #if debug():
        #    log("---- LCMachineCore::kill(), stepIdLines %s ----\n" % (self.lastSqueueResult) )
        killAttempted= False
        for line in self.lastSqueueResult:
            if test.jobname in line:
                # The first field of the matching line is what scancel gets.
                scancelCommand= 'scancel ' + line.split()[0]
                if debug():
                    log("---- LCMachineCore::kill: %s" % (scancelCommand), echo=True)
                #log("---- LCMachineCore::kill, test name: %s using: %s" % (test.jobname, scancelCommand), echo=True)
                #log("---- LCMachineCore::kill, line: %s" % (line), echo=True)
                utils.runThisCommand(scancelCommand)
                #time.sleep(2)
                killAttempted= True
                # Only the first matching line is cancelled.
                break
        # Nothing matched, so there is nothing to wait for.
        if not killAttempted:
            break
        if debug():
            log("---- LCMachineCore::kill, CALLED AGAIN %s test name: %s %s" % ((killTimes+1), test.jobname, test.serialNumber), echo=True)
        time.sleep(1)
        # Drop the cache so the next call fetches a fresh listing.
        self.lastSqueueResult= None
    time.sleep(2)
def setDirectoryPermissions(dir, groupID):
    """Assign ``dir`` to group ``groupID`` and open it to owner and group.

    The owner is left untouched (uid ``-1``) and the mode becomes
    ``S_IRWXU | S_IRWXG | S_ISUID | S_ISGID``. The operation is best
    effort: an ordinary failure is logged as a warning and not raised.
    """
    mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_ISUID | stat.S_ISGID
    try:
        os.chown(dir, -1, groupID)
        os.chmod(dir, mode)
    except Exception:
        # Kept broad so the call stays best effort, but no longer a bare
        # ``except`` that would also swallow KeyboardInterrupt/SystemExit.
        log('Warning - failed to set permissions on directory ' + dir, echo=True)
def listDatedDirs(folder):
    """Return the sub-directories of ``folder`` whose names end in a date tag.

    A name qualifies when it ends with ``2ddd_dd`` (``d`` being a digit) and
    the entry is a directory. If ``folder`` cannot be listed, a warning is
    logged and an empty list is returned.
    """
    date_suffix = re.compile('2[0-9][0-9][0-9]_[0-9][0-9]$')
    matches = []
    try:
        for entry in os.listdir(folder):
            if not date_suffix.search(entry):
                continue
            if os.path.isdir(os.path.join(folder, entry)):
                matches.append(entry)
    except OSError as error:
        log("WARNING - listDatedDirs: %s" % error.strerror, echo=True)
        matches = []
    return matches
def init(self):
    """Detect the Slurm version and allocation, then run the parent ``init``.

    * Runs ``srun --version`` and stores the second whitespace-separated
      token as ``slurm_version_str`` on ``SlurmProcessorScheduled``, plus
      ``major*1000 + minor*100 + patch`` as ``slurm_version_int``.
    * Takes the node count from ``SLURM_JOB_NUM_NODES`` or, failing that,
      ``SLURM_NNODES``; ``npMax`` comes from ``SLURM_JOB_CPUS_PER_NODE``
      (the text before any ``(``). With neither variable set the machine is
      flagged as not running within salloc and ``npMax`` falls back to
      ``numberTestsRunningMax``.
    * Forces ``npMax``/``npMaxH`` to 20 on hosts starting with ``rzalastor``.
    * Records whether ``SLURM_PTY_PORT`` or ``SLURM_STEP_ID`` is set and
      prints a notice when one is.
    """
    # Identify the slurm version so ATS may account for differences
    # in slurm behavior
    version_output = subprocess.check_output(['srun', '--version'], text=True)
    SlurmProcessorScheduled.slurm_version_str = version_output.split()[1]
    log('SLURM VERSION STRING', SlurmProcessorScheduled.slurm_version_str)

    parts = SlurmProcessorScheduled.slurm_version_str.split('.')
    SlurmProcessorScheduled.slurm_version_int = (
        int(parts[0]) * 1000 + int(parts[1]) * 100 + int(parts[2]))
    log('SLURM VERSION NUMBER', SlurmProcessorScheduled.slurm_version_int)

    # The first node-count variable that is present wins.
    self.runningWithinSalloc = True
    for node_var in ("SLURM_JOB_NUM_NODES", "SLURM_NNODES"):
        if node_var in os.environ:
            self.numNodes = int(os.environ[node_var])
            cpus_per_node = os.environ.get("SLURM_JOB_CPUS_PER_NODE", "1")
            self.npMax = int(cpus_per_node.split("(")[0])
            break
    else:
        self.runningWithinSalloc = False
        self.npMax = self.numberTestsRunningMax

    # Set cores on alastor to 20
    host = os.environ.get("HOSTNAME")
    if host is not None:
        self.hostname = host
        if host.startswith('rzalastor'):
            print("Setting npMax to 20 on alastor")
            self.npMax = 20
            self.npMaxH = 20

    # Does slurm see the ATS process itself as utilizing a core?
    self.slurmSeesATSProcessAsUsingACore = (
        "SLURM_PTY_PORT" in os.environ or "SLURM_STEP_ID" in os.environ)
    if self.slurmSeesATSProcessAsUsingACore:
        print(""" ATS NOTICE: Slurm sees ATS or Shell as itself using a CPU. ATS Will ignore 'nn' (number of nodes) test options and allow processes to span multiple nodes for better throughput and to help prevent srun hangs. NOTE: This feature may not fix possible hangs resulting from a single test case which utilizes all allocated cores. Slurm may not see all the cores as usable and accept the job but not schedule it, resulting in a hang The node spanning behavior may be overridden with the --strict_nn ATS option. 
CAUTION: Use of --strict_nn may result in slurm/srun hangs which are beyond the control of ATS, depending on how the nodes were allocated """)

    super(SlurmProcessorScheduled, self).init()
def listfiles(folder):
    """Return the names of the regular files directly inside ``folder``.

    If ``folder`` cannot be listed, a warning is logged and an empty list
    is returned.
    """
    found = []
    try:
        for entry in os.listdir(folder):
            if os.path.isfile(os.path.join(folder, entry)):
                found.append(entry)
    except OSError as error:
        log("WARNING - listfiles: %s" % error.strerror, echo=True)
        found = []
    return found
def listdirs(folder):
    """Return the names of the directories directly inside ``folder``.

    If ``folder`` cannot be listed, a warning is logged and an empty list
    is returned.
    """
    found = []
    try:
        for entry in os.listdir(folder):
            if os.path.isdir(os.path.join(folder, entry)):
                found.append(entry)
    except OSError as error:
        log("WARNING - listdirs: %s" % error.strerror, echo=True)
        found = []
    return found
def noteLaunch(self, test):
    """Account for the processors taken by a newly launched ``test``.

    At least one processor is charged per test, both to the overall
    ``npBusy`` total and to the entry for ``test.nodename`` in
    ``allNodesUsed``. ``numberTestsRunning`` is then set to ``npBusy``.
    """
    claimed = max(test.np, 1)
    self.npBusy += claimed
    self.allNodesUsed[test.nodename] += claimed

    if debug():
        details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
        log("dawnCompile.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
            echo=True)

    self.numberTestsRunning = self.npBusy
def noteLaunch(self, test):
    """Mark the node(s) used by a newly launched ``test`` as occupied.

    When the test has an MPI nodes file, every node in ``test.mpiNodesList``
    is flagged with 1 in ``mapNodeName_ProcsUsed``; otherwise only
    ``test.nodeToUse`` is flagged.
    """
    if test.mpiNodesFilename is None:
        occupied = [test.nodeToUse]
    else:
        occupied = test.mpiNodesList
    for node in occupied:
        self.mapNodeName_ProcsUsed[node] = 1

    if debug():
        details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
        log("angrenSandia.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
            echo=True)
def makeDir(new_dir):
    """Make sure ``new_dir`` exists as a directory, creating it if needed.

    Only the final path component is created (``os.mkdir``).

    Raises:
        SystemExit: with status 1, after logging, when the directory cannot
            be created or the path exists but is not a directory.
    """
    try:
        # Try first and inspect afterwards: an exists()-then-mkdir() check
        # fails spuriously when another process creates the directory in
        # between the two calls.
        os.mkdir(new_dir)
    except OSError as error:
        if os.path.isdir(new_dir):
            # Already present (possibly created concurrently).
            return
        if os.path.exists(new_dir):
            log('ERROR: %s exists and is NOT a directory' % new_dir, echo=True)
        else:
            log('Error making %s: %s' % (new_dir, error.strerror), echo=True)
        raise SystemExit(1)
def noteEnd(self, test):
    """Mark the node(s) used by a finished ``test`` as free again.

    When the test has an MPI nodes file, every node in ``test.mpiNodesList``
    is reset to 0 in ``mapNodeName_ProcsUsed``; otherwise only
    ``test.nodeToUse`` is reset. ``numberTestsRunning`` is then set to
    ``npBusy``.
    """
    if test.mpiNodesFilename is None:
        released = [test.nodeToUse]
    else:
        released = test.mpiNodesList
    for node in released:
        self.mapNodeName_ProcsUsed[node] = 0

    if debug():
        summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
        log(summary, echo=True)

    self.numberTestsRunning = self.npBusy
def kill(self, test):
    """Cancel the batch job of a running or timed-out ``test`` with ``scancel -n``.

    Killing the srun command itself is not enough to end the job (squeue
    still shows it running), so the job is cancelled by name. Tests in any
    other status are left alone. Failures are logged, never raised.
    """
    import subprocess

    if test.status is not RUNNING and test.status is not TIMEDOUT:
        return

    try:
        # An argument list without a shell passes the job name to scancel
        # verbatim, whatever characters it contains.
        retcode = subprocess.call(["scancel", "-n", test.jobname])
        if retcode < 0:
            # retcode is negative here; negating it for the "-%d" format
            # avoids the double minus sign the message used to show.
            log("---- kill() in chaosMulti.py, command= scancel -n %s failed with return code -%d ----" % (test.jobname, -retcode), echo=True)
    except OSError as e:
        log("---- kill() in chaosMulti.py, execution of command failed (scancel -n %s) failed: %s----" % (test.jobname, e), echo=True)
def noteEnd(self, test):
    """Release the processors held by a finished ``test``.

    Without ``removeSrunStep`` the step bookkeeping is updated through
    ``utils.removeFromUsedTotalDic`` and at least one processor is
    released. With it, whole nodes are released
    (``test.numberOfNodesNeeded * self.npMax``) unless ``test.np`` is
    larger. ``numberTestsRunning`` is then set to ``npBusy``.
    """
    if not self.removeSrunStep:
        self.stepUsedDic= utils.removeFromUsedTotalDic(self.stepUsedDic, self.nodeStepNumDic, self.npMax, test.step, test.np, self.stepId, self.allNodeList)
        self.npBusy -= max(test.np, 1)
    else:
        self.npBusy -= max(test.np, test.numberOfNodesNeeded*self.npMax) # this is necessary when srun exclusive is used.
    # NOTE(review): the source was flattened; the scheduler call is assumed
    # to sit inside the debug branch with the log call -- confirm.
    if debug():
        log("Finished %s, #total proc in use = %d" % (test.name, self.npBusy), echo=True)
        self.scheduler.schedule("Finished %s, #total proc in use = %d" % (test.name, self.npBusy))
    self.numberTestsRunning= self.npBusy
def noteEnd(self, test):
    """Release the processors held by a finished ``test``.

    At least one processor is released, both from the overall ``npBusy``
    total and from the entry for ``test.nodename`` in ``allNodesUsed``.
    ``numberTestsRunning`` is set to ``npBusy`` and the test's log files
    are passed to ``catLogFiles``.
    """
    released = max(test.np, 1)
    self.npBusy -= released
    self.allNodesUsed[test.nodename] -= released

    if debug():
        summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
        log(summary, echo=True)

    self.numberTestsRunning = self.npBusy

    # Add to combo log file
    self.catLogFiles(test)
def launch(self, test):
    """Build the command for ``test`` and start it through ``self._launch``.

    The command line and its whitespace-split list are stored on the test.
    The command line is echoed when debugging or when ``--skip`` is set.
    With ``--skip`` the test is marked SKIPPED and False is returned without
    launching; otherwise the start time is stamped and the result of
    ``self._launch(test)`` is returned.
    """
    test.commandLine = self.calculateCommandLine(test)
    test.commandList = list(test.commandLine.split())

    skip_requested = configuration.options.skip
    if debug() or skip_requested:
        log.indent()
        log(test.commandLine, echo=True)
        log.dedent()

    if not skip_requested:
        test.setStartDateTime()
        return self._launch(test)

    test.set(atsut.SKIPPED, "--skip option")
    return False
def noteEnd(self, test):
    """Release the processors held by a finished ``test``.

    The test's processor count (at least 1) is rounded up to whole nodes of
    ``npMax`` processors, which is necessary when srun exclusive is used.
    ``numberTestsRunning`` is then set to ``npBusy``.
    """
    whole_nodes, leftover = divmod(max(1, test.np), self.npMax)
    nodes_needed = whole_nodes + 1 if leftover else whole_nodes
    self.npBusy -= max(test.np, nodes_needed * self.npMax)

    if debug():
        summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
        log(summary, echo=True)

    self.numberTestsRunning = self.npBusy
def noteLaunch(self, test):
    """Account for the processors taken by a newly launched ``test``.

    At least one processor is charged to ``npBusy``. Unless
    ``removeSrunStep`` is set, the test is tagged with the step in use and
    the step bookkeeping is updated through ``utils.addToUsedTotalDic``.
    ``numberTestsRunning`` is then set to ``npBusy``.
    """
    self.npBusy += max(test.np, 1)

    track_steps = not self.removeSrunStep
    if track_steps:
        test.step = self.stepInUse
        utils.addToUsedTotalDic(self.stepUsedDic, self.nodeStepNumDic,
                                self.npMax, self.stepInUse, test.np)

    if debug():
        details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
        log("chaosCompile.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
            echo=True)

    self.numberTestsRunning = self.npBusy
def noteLaunch(self, test):
    """Account for the processors taken by a newly launched ``test``.

    Without ``removeSrunStep`` the step bookkeeping is updated through
    ``utils.addToUsedTotalDic`` and at least one processor is charged.
    With it, whole nodes are charged
    (``test.numberOfNodesNeeded * self.npMax``) unless ``test.np`` is
    larger. ``numberTestsRunning`` is then set to ``npBusy``.
    """
    if not self.removeSrunStep:
        utils.addToUsedTotalDic(self.stepUsedDic, self.nodeStepNumDic, self.npMax, self.stepInUse, test.np)
        self.npBusy += max(test.np, 1)
    else:
        self.npBusy += max(test.np, test.numberOfNodesNeeded*self.npMax) # this is necessary when srun exclusive is used.
    # NOTE(review): the source was flattened; the scheduler call is assumed
    # to sit inside the debug branch with the log call -- confirm.
    if debug():
        log("Max np= %d. Launched %s with np= %d tests, total proc in use = %d" % \
            (self.numberMaxProcessors, test.name, test.np, self.npBusy), echo=True)
        self.scheduler.schedule("Max np= %d. Launched %s with np= %d tests, total proc in use = %d" % \
            (self.numberMaxProcessors, test.name, test.np, self.npBusy))
    self.numberTestsRunning= self.npBusy
def noteLaunch(self, test):
    """Account for the processors taken by a newly launched ``test``.

    The test's processor count (at least 1) is rounded up to whole nodes of
    ``npMax`` processors, which is necessary when srun exclusive is used.
    ``numberTestsRunning`` is then set to ``npBusy``.
    """
    whole_nodes, leftover = divmod(max(test.np, 1), self.npMax)
    nodes_needed = whole_nodes + 1 if leftover else whole_nodes
    self.npBusy += max(test.np, nodes_needed * self.npMax)

    if debug():
        details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
        log("cray.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
            echo=True)

    self.numberTestsRunning = self.npBusy
def startRun(self, test):
    """Announce ``test`` as being batched and launch it.

    Logs the serial number, name and current time; with debugging on, the
    test directory is logged too. Returns whatever ``self.launch(test)``
    returns.
    """
    banner = 'Batching #%d' % test.serialNumber
    log(banner, test.name, time.asctime(), echo=True)

    log.indent()
    if debug():
        label = 'For test #%d' % (test.serialNumber)
        log(label, ' in test directory', test.directory, echo=True)
    log.dedent()

    started = self.launch(test)
    return started
def noteEnd(self, test):
    """Release the processors held by a finished ``test``.

    At least one processor is released from ``npBusy``. Unless
    ``removeSrunStep`` is set, the step bookkeeping is updated through
    ``utils.removeFromUsedTotalDic``. ``numberTestsRunning`` is set to
    ``npBusy`` and the test's log files are passed to ``catLogFiles``.
    """
    self.npBusy -= max(test.np, 1)

    track_steps = not self.removeSrunStep
    if track_steps:
        self.stepUsedDic = utils.removeFromUsedTotalDic(
            self.stepUsedDic,
            self.nodeStepNumDic,
            self.npMax,
            test.step,
            test.np,
            self.stepId,
            self.allNodeList,
        )

    if debug():
        summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
        log(summary, echo=True)

    self.numberTestsRunning = self.npBusy

    # Add to combo log file
    self.catLogFiles(test)
def noteEnd(self, test):
    """Release the resources held by a finished ``test``.

    Called by machines.py (getStatus() -> testEnded() -> noteEnd()).

    An HTC test gives back ``test.nodes`` from ``numHTCUsed``. Any other
    test gives back its node count rounded up to a multiple of
    ``numNodesPerTest``, times ``numCPUPerNode``, from ``npBusy``.
    ``numberTestsRunning`` is recomputed as
    ``numHTCUsed + npBusy / numNodesPerTest`` (true division).
    """
    import math

    if not test.useHTCNode:
        groups = math.ceil(test.nodes / float(self.numNodesPerTest))
        self.npBusy -= groups * self.numNodesPerTest * self.numCPUPerNode
    else:
        self.numHTCUsed -= test.nodes

    self.numberTestsRunning = self.numHTCUsed + (self.npBusy / self.numNodesPerTest)

    if debug():
        details = (test.name, self.numberTestsRunning, self.npBusy)
        log("Finished %s, now running %d tests, #proc used = %d" % details,
            echo=True)
def kill(self, test):
    """Cancel the LSF job of a running or timed-out ``test`` with ``bkill -J``.

    Nothing is done unless ``self.runningWithinBsub`` equals False and the
    test status is RUNNING or TIMEDOUT. Failures are logged, never raised.
    """
    import subprocess

    # ``== False`` is kept on purpose: it preserves the existing behaviour
    # for values such as None, which ``not ...`` would treat differently.
    outside_bsub = self.runningWithinBsub == False  # noqa: E712
    if not outside_bsub:
        return
    if test.status is not RUNNING and test.status is not TIMEDOUT:
        return

    try:
        print("ATS cancelling job: bkill -J " + test.jobname)
        # An argument list without a shell passes the job name to bkill
        # verbatim, whatever characters it contains.
        retcode = subprocess.call(["bkill", "-J", test.jobname])
        if retcode < 0:
            # retcode is negative here; negating it for the "-%d" format
            # avoids the double minus sign the message used to show.
            log("---- bkill() in lsf_asq.py, command= bkill -J %s failed with return code -%d ----" % (test.jobname, -retcode), echo=True)
    except OSError as e:
        log("---- bkill() in lsf_asq.py, execution of command failed (bkill -J %s) failed: %s----" % (test.jobname, e), echo=True)
def runCommand(cmd_line, file_name=None, exit=True, verbose=False):
    """Run a command and capture its output.

    Args:
        cmd_line: Command string; it is split with ``shlex.split`` and run
            without a shell.
        file_name: When given, stdout goes to this file and stderr to
            ``<file_name>.err``. Both are appended to if ``file_name``
            already exists, otherwise both are (re)created. When None,
            both streams are captured through pipes.
        exit: When true, a failure to run the command raises
            ``SystemExit(1)`` after logging.
        verbose: Passed as ``echo`` when logging the command line.

    Returns:
        The ``(stdout, stderr)`` pair from ``Popen.communicate()``, or
        ``(None, None)`` when the command could not be run and ``exit`` is
        false.

    Raises:
        SystemExit: on failure when ``exit`` is true.
    """
    popen_args = shlex.split(cmd_line)
    log('runCommand command line: %s' % cmd_line, echo=verbose)

    # Bound up front so the final return cannot hit an unbound name when the
    # command fails to start and ``exit`` is false.
    stdout_txt = None
    stderr_txt = None
    opened = []
    try:
        if file_name is not None:
            mode = 'a' if os.path.exists(file_name) else 'w'
            stdout_pipe = open(file_name, mode)
            opened.append(stdout_pipe)
            stderr_pipe = open('%s.err' % file_name, mode)
            opened.append(stderr_pipe)
        else:
            stdout_pipe = PIPE
            stderr_pipe = PIPE

        (stdout_txt, stderr_txt) = Popen(popen_args,
                                         stdout=stdout_pipe,
                                         stderr=stderr_pipe,
                                         text=True).communicate()
    except CalledProcessError as error:
        # Kept for compatibility with the previous handler set.
        log('Command failed: error code %d' % error.returncode, echo=True)
        log('Failed Command: %s' % cmd_line, echo=True)
        if exit:
            raise SystemExit(1)
    except OSError as error:
        # ``child_traceback`` is not present on Python 3 exceptions; fall
        # back to the exception itself instead of failing in the handler.
        detail = getattr(error, 'child_traceback', error)
        log('Command failed with OSError: traceback %s' % detail, echo=True)
        log('Failed Command: %s' % cmd_line, echo=True)
        if exit:
            raise SystemExit(1)
    finally:
        # Close the output files on every path, including failures.
        for handle in opened:
            handle.close()

    return (stdout_txt, stderr_txt)
def copyAndRenameFile(filename, newfilename, srcdir, destdir, groupID):
    """Copy ``srcdir/filename`` into ``destdir`` under a new base name.

    The destination name is the base name of ``newfilename``. After the
    copy, the file is given to group ``groupID`` and made owner/group
    read-write; a failure there is only logged as a warning.

    Returns the destination path, or None (after logging a warning) when
    the source file does not exist.
    """
    source_path = os.path.join(srcdir, filename)
    if not os.path.isfile(source_path):
        log("WARNING - copyAndRenameFile: %s file does not exist in %s." %
            (filename, srcdir), echo=True)
        return None

    target_name = os.path.basename(newfilename)
    destfile = os.path.join(destdir, target_name)
    shutil.copyfile(source_path, destfile)

    group_rw = stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IWGRP
    try:
        os.chown(destfile, -1, groupID)
        os.chmod(destfile, group_rw)
    except OSError as error:
        log('WARNING - failed to set permissions on %s: %s' %
            (destfile, error.strerror), echo=True)
    return destfile
def noteLaunch(self, test):
    """Account for the resources taken by a newly launched ``test``.

    Called by machines.py (startRun() -> launch() -> noteLaunch()).

    An HTC test adds ``test.nodes`` to ``numHTCUsed``. Any other test adds
    its node count rounded up to a multiple of ``numNodesPerTest``, times
    ``numCPUPerNode``, to ``npBusy``. ``numberTestsRunning`` is recomputed
    as ``numHTCUsed + npBusy / numNodesPerTest`` (true division), and
    ``periodicReport`` is called last.
    """
    import math

    if not test.useHTCNode:
        groups = math.ceil(test.nodes / float(self.numNodesPerTest))
        self.npBusy += groups * self.numNodesPerTest * self.numCPUPerNode
    else:
        self.numHTCUsed += test.nodes

    self.numberTestsRunning = self.numHTCUsed + (self.npBusy / self.numNodesPerTest)

    if debug():
        details = (test.name, self.numberTestsRunning, self.npBusy, self.numHTCUsed)
        log("dawnHTC.py: Launched %s,\tnow running %d tests,\t#nodes used = %d,\t#htc used= %d" % details,
            echo=True)

    self.periodicReport()