コード例 #1
0
def setFilePermissions(file, groupID):
    """Best-effort: assign ``file`` to ``groupID`` and make it owner/group read-write.

    The owner is left unchanged (uid ``-1``) and the mode is set to
    ``rw-rw----``.  A failure is logged as a warning instead of being
    raised, so callers continue even when the change is not permitted.

    Args:
        file: Path of the file to update.
        groupID: Numeric group id to assign.
    """
    try:
        os.chown(file, -1, groupID)
        os.chmod(file,
                 stat.S_IREAD | stat.S_IWRITE | stat.S_IRGRP | stat.S_IWGRP)
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit.  Only ordinary errors are
        # downgraded to a warning.
        log('Warning - failed to set permissions on file ' + file, echo=True)
コード例 #2
0
 def quit(self):
     """Final cleanup: send SIGTERM to the fork server subprocess, if any."""
     log("dawnHTCKull.py --- quit() -- final cleanup", echo=True)
     if self.forkSubprocess:
         import signal
         try:
             os.kill(self.forkSubprocess.pid, signal.SIGTERM)
         except OSError as e:
             # The subprocess may already be gone; cleanup must not abort
             # the shutdown because the signal could not be delivered.
             log("dawnHTCKull.py --- quit() -- could not signal forkserver: %s" % e, echo=True)
コード例 #3
0
ファイル: batchSingle.py プロジェクト: kurtsansom/ats
    def noteEnd(self, test):
        """Record that ``test`` has finished by decrementing the running count.

        When debugging is enabled, the test name and the updated count are
        logged.
        """
        # Decrement first so the "now running" figure in the debug message is
        # the count after this test ended, not the stale pre-decrement value.
        self.numberTestsRunning -= 1

        if debug():
            log("Finished %s, now running %d tests" % \
                (test.name, self.numberTestsRunning), echo=True)
コード例 #4
0
ファイル: batchSingle.py プロジェクト: kurtsansom/ats
    def load(self, testlist):
        """Receive a list of tests to possibly run and start processing them.

        Assumes that status is already not CREATED if a test could never run.
        Every test and each of its dependents gets fresh batch bookkeeping
        before ``self.run()`` is called.  Returns ``len(self.testlist)``.
        """
        log("Start submitting batch jobs ........ Note, only a max of %s jobs will be submitted at a time. "
            % (self.maxBatchAllowed),
            echo=True)

        def resetBatchState(item):
            # No dependency recorded, not submitted, batch status unknown.
            item.batchDic = {}
            item.batchDic['depends_on'] = None
            item.submitted = False
            item.batchstatus = "UNKNOWN"

        self.testlist = testlist
        self.running = []
        self.numberTestsRunning = 0

        for candidate in testlist:
            resetBatchState(candidate)
            for dependent in candidate.dependents:
                resetBatchState(dependent)

        self.run()
        return len(self.testlist)
コード例 #5
0
    def forkServerSetup(self):
        """Start the fork server and publish its directory in FORKSERVERDIR.

        Does nothing when ``FORKSERVERDIR`` is already in the environment.
        Otherwise the forkserver script is launched through the shell with
        stdout and stderr redirected to temporary files.  After a short wait
        the first line of its stdout, stripped, is stored in
        ``os.environ['FORKSERVERDIR']``.  The Popen object is kept in
        ``self.forkSubprocess``.
        """
        import tempfile
        import time

        if 'FORKSERVERDIR' in os.environ:
            return

        forkCommand = "/usr/gapps/coop/forkserver/bin/forkserver.py"

        # Pre-bind the handles so the error path never touches an unassigned
        # name when the first NamedTemporaryFile() call is the one that fails.
        outHandle = errHandle = None
        try:
            outHandle = tempfile.NamedTemporaryFile('w')
            errHandle = tempfile.NamedTemporaryFile('w')
            self.forkSubprocess = subprocess.Popen(forkCommand, shell=True, stdout=outHandle, stderr=errHandle)
        except OSError:
            print("Error in running the forkserver. ", sys.exc_info()[0])
            for handle in (outHandle, errHandle):
                if handle is not None:
                    handle.close()
            # Nothing was started, so there is no output to wait for or read.
            # NOTE(review): the message says the variable is set to ".", but
            # FORKSERVERDIR is left unset here, as before - confirm intent.
            log("Note: setting FORKSERVER env to be .", echo=True)
            return

        time.sleep(2)  # give the forkserver a sec to write out the directory name

        try:
            with open(outHandle.name, 'r') as newfile:
                line1 = newfile.readlines()

            os.environ['FORKSERVERDIR'] = line1[0].strip()
            log("Note: setting FORKSERVER env to be %s" % (line1[0]), echo=True)
        except (OSError, IndexError):
            # Unreadable output file, or the forkserver wrote nothing yet.
            log("Note: setting FORKSERVER env to be .", echo=True)
コード例 #6
0
    def kill(self, test):
        """Cancel the slurm step whose squeue line mentions ``test.jobname``.

        The step listing is cached on ``self.lastSqueueResult`` and refreshed
        when it is missing or more than 60 seconds old.  The first
        whitespace-separated field of the first matching line is passed to
        ``scancel``.  After a cancel attempt the cache is cleared and the
        method sleeps for three seconds in total.
        """

        # range(0,1) yields a single pass, so no second attempt is made even
        # though the body is written like a retry loop.
        for killTimes in range(0,1):

            if self.lastSqueueResult is None or ( (time.time() - self.lastTimeSqueueCalled) > 60):   # in seconds
                # presumably one text line per slurm step - verify against utils
                self.lastSqueueResult= utils.getAllSlurmStepIds()
                self.lastTimeSqueueCalled= time.time()    # set time
                #if debug():
                #    log("---- LCMachineCore::kill(), stepIdLines  %s ----\n" %  (self.lastSqueueResult) )

            killAttempted= False
            for line in self.lastSqueueResult:
                # Substring match: the first line containing the job name wins.
                if test.jobname in line:
                    scancelCommand= 'scancel ' + line.split()[0]
                    if debug():
                        log("---- LCMachineCore::kill: %s" %  (scancelCommand), echo=True)
                        #log("---- LCMachineCore::kill, test name: %s using: %s" %  (test.jobname, scancelCommand), echo=True)
                        #log("---- LCMachineCore::kill, line: %s" %  (line), echo=True)
                    utils.runThisCommand(scancelCommand)
                    #time.sleep(2)
                    killAttempted= True
                    break

            # No matching line: nothing was cancelled, so skip the waits below.
            if not killAttempted:
                break

            if debug():
                log("---- LCMachineCore::kill, CALLED AGAIN %s test name: %s %s" %  ((killTimes+1), test.jobname, test.serialNumber), echo=True)
            time.sleep(1)

            # Drop the cached listing so the next call fetches a fresh one.
            self.lastSqueueResult= None
            time.sleep(2)
コード例 #7
0
def setDirectoryPermissions(dir, groupID):
    """Best-effort: assign ``dir`` to ``groupID`` and open it to owner and group.

    The owner is left unchanged (uid ``-1``).  The requested mode is
    ``rwxrwx---`` plus the set-user-ID and set-group-ID bits.  A failure is
    logged as a warning instead of being raised.

    Args:
        dir: Path of the directory to update.
        groupID: Numeric group id to assign.
    """
    try:
        os.chown(dir, -1, groupID)
        os.chmod(dir,
                 stat.S_IRWXU | stat.S_IRWXG | stat.S_ISUID | stat.S_ISGID)
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit.  Only ordinary errors are
        # downgraded to a warning.
        log('Warning - failed to set permissions on directory ' + dir,
            echo=True)
コード例 #8
0
def listDatedDirs(folder):
    """Return names of sub-directories of ``folder`` that end in a date tag.

    A name qualifies when it ends with ``2ddd_dd`` (for example ``2021_03``)
    and the entry is a directory.  If ``folder`` cannot be listed, a warning
    is logged and an empty list is returned.
    """
    try:
        dated = []
        for entry in os.listdir(folder):
            if not re.search('2[0-9][0-9][0-9]_[0-9][0-9]$', entry):
                continue
            if os.path.isdir(os.path.join(folder, entry)):
                dated.append(entry)
        return dated
    except OSError as error:
        log("WARNING - listDatedDirs: %s" % error.strerror, echo=True)
        return []
コード例 #9
0
    def init(self):

        # Identify the slurm version so ATS may account for differences
        # in slurm behavior
        tstr = subprocess.check_output(['srun', '--version'], text=True)
        tarray = tstr.split()
        SlurmProcessorScheduled.slurm_version_str = tarray[1]
        log('SLURM VERSION STRING', SlurmProcessorScheduled.slurm_version_str)
        tarray = SlurmProcessorScheduled.slurm_version_str.split('.')
        SlurmProcessorScheduled.slurm_version_int = (int(tarray[0]) * 1000) + (
            int(tarray[1]) * 100) + (int(tarray[2]))
        log('SLURM VERSION NUMBER', SlurmProcessorScheduled.slurm_version_int)

        self.runningWithinSalloc = True

        if "SLURM_JOB_NUM_NODES" in os.environ.keys():
            self.numNodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
            self.npMax = int(
                os.getenv("SLURM_JOB_CPUS_PER_NODE", "1").split("(")[0])
        elif "SLURM_NNODES" in os.environ.keys():
            self.numNodes = int(os.getenv("SLURM_NNODES"))
            self.npMax = int(
                os.getenv("SLURM_JOB_CPUS_PER_NODE", "1").split("(")[0])
        else:
            self.runningWithinSalloc = False
            self.npMax = self.numberTestsRunningMax

        # Set cores on alastor to 20
        if "HOSTNAME" in os.environ.keys():
            self.hostname = os.getenv("HOSTNAME")
            if self.hostname.startswith('rzalastor'):
                print("Setting npMax to 20 on alastor")
                self.npMax = 20
                self.npMaxH = 20

        # Does slurm see the ATS process itself as utilizing a core?
        self.slurmSeesATSProcessAsUsingACore = False
        if "SLURM_PTY_PORT" in os.environ.keys(
        ) or "SLURM_STEP_ID" in os.environ.keys():
            self.slurmSeesATSProcessAsUsingACore = True
            print("""
ATS NOTICE: Slurm sees ATS or Shell as itself using a CPU.
            ATS Will ignore 'nn' (number of nodes) test options and allow processes
            to span multiple nodes for better throughput and to help prevent srun hangs.

            NOTE: This feature may not fix possible hangs resulting from a single test
                  case which utilizes all allocated cores. Slurm may not see all 
                  the cores as usable and accept the job but not schedule it, resulting in a hang

            The node spanning behavior may be overridden with the --strict_nn ATS option.

            CAUTION: Use of --strict_nn may result in slurm/srun hangs which are 
                     beyond the control of ATS, depending on how the nodes were allocated
""")

        super(SlurmProcessorScheduled, self).init()
コード例 #10
0
def listfiles(folder):
    """Return the names of the regular files directly inside ``folder``.

    If ``folder`` cannot be listed, a warning is logged and an empty list
    is returned.
    """
    try:
        found = []
        for entry in os.listdir(folder):
            if os.path.isfile(os.path.join(folder, entry)):
                found.append(entry)
        return found
    except OSError as error:
        log("WARNING - listfiles: %s" % error.strerror, echo=True)
        return []
コード例 #11
0
def listdirs(folder):
    """Return the names of the directories directly inside ``folder``.

    If ``folder`` cannot be listed, a warning is logged and an empty list
    is returned.
    """
    try:
        found = []
        for entry in os.listdir(folder):
            if os.path.isdir(os.path.join(folder, entry)):
                found.append(entry)
        return found
    except OSError as error:
        log("WARNING - listdirs: %s" % error.strerror, echo=True)
        return []
コード例 #12
0
ファイル: dawnCompile.py プロジェクト: kurtsansom/ats
    def noteLaunch(self, test):
        """Add the processors of a just-launched test to the busy totals.

        At least one processor is counted per test, both overall and for the
        node named by ``test.nodename``.
        """
        procs = max(test.np, 1)
        self.npBusy += procs
        self.allNodesUsed[test.nodename] += procs

        if debug():
            details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
            log("dawnCompile.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
                echo=True)

        self.numberTestsRunning = self.npBusy
コード例 #13
0
    def noteLaunch(self, test):
        """Mark the node(s) used by a just-launched test as occupied.

        Every node in ``test.mpiNodesList`` is marked when the test has an
        MPI nodes file; otherwise only ``test.nodeToUse`` is marked.
        """
        if test.mpiNodesFilename is None:
            occupied = [test.nodeToUse]
        else:
            occupied = test.mpiNodesList

        for nodeName in occupied:
            self.mapNodeName_ProcsUsed[nodeName] = 1

        if debug():
            details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
            log("angrenSandia.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
                echo=True)
コード例 #14
0
def makeDir(new_dir):
    """Create ``new_dir`` unless it already exists as a directory.

    Raises:
        SystemExit: with status 1 when the path exists but is not a
            directory, or when ``os.mkdir`` fails.  The reason is logged
            first.
    """
    if os.path.exists(new_dir):
        if not os.path.isdir(new_dir):
            log('ERROR: %s exists and is NOT a directory' % new_dir, echo=True)
            raise SystemExit(1)
        return

    try:
        os.mkdir(new_dir)
    except OSError as error:
        log('Error making %s: %s' % (new_dir, error.strerror), echo=True)
        raise SystemExit(1)
コード例 #15
0
    def noteEnd(self, test):
        """Mark the node(s) used by a finished test as free again.

        Every node in ``test.mpiNodesList`` is cleared when the test has an
        MPI nodes file; otherwise only ``test.nodeToUse`` is cleared.
        """
        if test.mpiNodesFilename is None:
            released = [test.nodeToUse]
        else:
            released = test.mpiNodesList

        for nodeName in released:
            self.mapNodeName_ProcsUsed[nodeName] = 0

        if debug():
            summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
            log(summary, echo=True)

        self.numberTestsRunning = self.npBusy
コード例 #16
0
ファイル: chaosMulti.py プロジェクト: kurtsansom/ats
 def kill(self, test):
     """Cancel the slurm job of a running or timed-out test with ``scancel -n``.

     Killing the srun command itself is not enough to end the job; it keeps
     running (squeue still shows it), so the job is cancelled by name.
     Failures are logged, never raised.
     """
     import subprocess

     if test.status is RUNNING or test.status is TIMEDOUT:
         try:
             # Argument list without a shell: the job name is passed verbatim
             # and a missing scancel surfaces as OSError below.
             retcode = subprocess.call(["scancel", "-n", test.jobname])
             if retcode < 0:
                 # A negative return code is -signal; negate it so the
                 # message does not show a double minus sign.
                 log("---- kill() in chaosMulti.py, command= scancel -n  %s failed with return code -%d  ----" % (test.jobname, -retcode), echo=True)
         except OSError as e:
             log("---- kill() in chaosMulti.py, execution of command failed (scancel -n  %s) failed:  %s----" % (test.jobname, e), echo=True)
コード例 #17
0
ファイル: chaosMulti.py プロジェクト: kurtsansom/ats
    def noteEnd(self, test):
        """Release the processors held by a finished test.

        When srun steps are tracked (``removeSrunStep`` is false) the step
        usage dictionary is updated and at least one processor is released.
        Otherwise whole nodes are released.
        """
        if self.removeSrunStep:
            # this is necessary when srun exclusive is used.
            self.npBusy -= max(test.np, test.numberOfNodesNeeded*self.npMax)
        else:
            self.stepUsedDic = utils.removeFromUsedTotalDic(
                self.stepUsedDic, self.nodeStepNumDic, self.npMax,
                test.step, test.np, self.stepId, self.allNodeList)
            self.npBusy -= max(test.np, 1)

        if debug():
            log("Finished %s, #total proc in use = %d" % (test.name, self.npBusy), echo=True)
            self.scheduler.schedule("Finished %s, #total proc in use = %d" % (test.name, self.npBusy))

        self.numberTestsRunning = self.npBusy
コード例 #18
0
ファイル: dawnCompile.py プロジェクト: kurtsansom/ats
    def noteEnd(self, test):
        """Release the processors of a finished test and append its logs.

        At least one processor is released per test, both overall and for
        the node named by ``test.nodename``.
        """
        procs = max(test.np, 1)
        self.npBusy -= procs
        self.allNodesUsed[test.nodename] -= procs

        if debug():
            summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
            log(summary, echo=True)

        self.numberTestsRunning = self.npBusy

        # Add to combo log file
        self.catLogFiles(test)
コード例 #19
0
ファイル: batchSingle.py プロジェクト: kurtsansom/ats
    def launch(self, test):
        """Build the command line for ``test`` and start it.

        Returns False, after marking the test SKIPPED, when the --skip
        option is set; otherwise returns the result of ``self._launch``.
        The command line is logged when debugging or skipping.
        """
        test.commandLine = self.calculateCommandLine(test)
        test.commandList = test.commandLine.split()

        showCommand = debug() or configuration.options.skip
        if showCommand:
            log.indent()
            log(test.commandLine, echo=True)
            log.dedent()

        if configuration.options.skip:
            test.set(atsut.SKIPPED, "--skip option")
            return False

        test.setStartDateTime()
        return self._launch(test)
コード例 #20
0
ファイル: crayMulti.py プロジェクト: kurtsansom/ats
    def noteEnd(self, test):
        """Release the processors of a finished test, counted in whole nodes."""
        procs = max(1, test.np)
        nodesNeeded, leftover = divmod(procs, self.npMax)
        if leftover:
            # A partly used node still counts as a full node.
            nodesNeeded += 1

        # this is necessary when srun exclusive is used.
        self.npBusy -= max(test.np, nodesNeeded * self.npMax)

        if debug():
            summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
            log(summary, echo=True)

        self.numberTestsRunning = self.npBusy
コード例 #21
0
ファイル: chaosCompile.py プロジェクト: kurtsansom/ats
    def noteLaunch(self, test):
        """Add the processors of a just-launched test to the busy total.

        When srun steps are tracked (``removeSrunStep`` is false) the test is
        tagged with the step in use and the step usage dictionary is updated.
        """
        self.npBusy += max(test.np, 1)

        trackSteps = not self.removeSrunStep
        if trackSteps:
            test.step = self.stepInUse
            utils.addToUsedTotalDic(self.stepUsedDic,
                                    self.nodeStepNumDic,
                                    self.npMax,
                                    self.stepInUse,
                                    test.np)

        if debug():
            details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
            log("chaosCompile.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
                echo=True)

        self.numberTestsRunning = self.npBusy
コード例 #22
0
ファイル: chaosMulti.py プロジェクト: kurtsansom/ats
    def noteLaunch(self, test):
        """Add the processors of a just-launched test to the busy total.

        When srun steps are tracked (``removeSrunStep`` is false) the step
        usage dictionary is updated and at least one processor is counted.
        Otherwise whole nodes are counted.
        """
        if self.removeSrunStep:
            # this is necessary when srun exclusive is used.
            self.npBusy += max(test.np, test.numberOfNodesNeeded*self.npMax)
        else:
            utils.addToUsedTotalDic(self.stepUsedDic, self.nodeStepNumDic,
                                    self.npMax, self.stepInUse, test.np)
            self.npBusy += max(test.np, 1)

        if debug():
            log("Max np= %d. Launched %s with np= %d tests, total proc in use = %d" %
                (self.numberMaxProcessors, test.name, test.np, self.npBusy),
                echo=True)
            self.scheduler.schedule(
                "Max np= %d. Launched %s with np= %d tests, total proc in use = %d" %
                (self.numberMaxProcessors, test.name, test.np, self.npBusy))

        self.numberTestsRunning = self.npBusy
コード例 #23
0
ファイル: crayMulti.py プロジェクト: kurtsansom/ats
    def noteLaunch(self, test):
        """Add the processors of a just-launched test, counted in whole nodes."""
        procs = max(test.np, 1)
        nodesNeeded, leftover = divmod(procs, self.npMax)
        if leftover:
            # A partly used node still counts as a full node.
            nodesNeeded += 1

        # this is necessary when srun exclusive is used.
        self.npBusy += max(test.np, nodesNeeded * self.npMax)

        if debug():
            details = (self.numberMaxProcessors, test.name, test.np, self.npBusy)
            log("cray.py__ Max np= %d. Launched %s with np= %d tests, total proc used = %d" % details,
                echo=True)

        self.numberTestsRunning = self.npBusy
コード例 #24
0
ファイル: batchSingle.py プロジェクト: kurtsansom/ats
 def startRun(self, test):
     """Log that ``test`` is being batched, then launch it.

     Returns the result of ``self.launch(test)``: True if the test could be
     started.
     """
     header = 'Batching #%d' % test.serialNumber
     log(header, test.name, time.asctime(), echo=True)

     log.indent()
     if debug():
         label = 'For test #%d' % (test.serialNumber)
         log(label, ' in test directory', test.directory, echo=True)
     log.dedent()

     return self.launch(test)
コード例 #25
0
ファイル: chaosCompile.py プロジェクト: kurtsansom/ats
    def noteEnd(self, test):
        """Release the processors of a finished test and append its logs.

        When srun steps are tracked (``removeSrunStep`` is false) the step
        usage dictionary is updated as well.
        """
        self.npBusy -= max(test.np, 1)

        trackSteps = not self.removeSrunStep
        if trackSteps:
            self.stepUsedDic = utils.removeFromUsedTotalDic(
                self.stepUsedDic,
                self.nodeStepNumDic,
                self.npMax,
                test.step,
                test.np,
                self.stepId,
                self.allNodeList)

        if debug():
            summary = "Finished %s, #total proc used = %d" % (test.name, self.npBusy)
            log(summary, echo=True)

        self.numberTestsRunning = self.npBusy

        # Add to combo log file
        self.catLogFiles(test)
コード例 #26
0
    def noteEnd(self, test):
        """Release the HTC nodes or processors held by a finished test.

        HTC tests give back ``test.nodes`` HTC nodes.  Other tests give back
        processors for their node count rounded up to a multiple of
        ``numNodesPerTest``.  The running-test figure is then recomputed.
        """
        # noteEnd is called by machines.py (getStatus() -> testEnded() -> noteEnd()
        import math

        if test.useHTCNode:
            self.numHTCUsed -= test.nodes
        else:
            nodeGroups = math.ceil(test.nodes / float(self.numNodesPerTest))
            self.npBusy -= nodeGroups * self.numNodesPerTest * self.numCPUPerNode

        fromProcs = self.npBusy / self.numNodesPerTest
        self.numberTestsRunning = self.numHTCUsed + fromProcs

        if debug():
            details = (test.name, self.numberTestsRunning, self.npBusy)
            log("Finished %s, now running %d tests, #proc used = %d" % details,
                echo=True)
コード例 #27
0
 def kill(self, test):
     """Cancel the LSF job of a running or timed-out test with ``bkill -J``.

     Nothing is done when ``self.runningWithinBsub`` is not False.
     Failures are logged, never raised.
     """
     import subprocess

     # Kept as an explicit comparison: values such as None must not be
     # treated the same as False here.
     if self.runningWithinBsub == False:
         if test.status is RUNNING or test.status is TIMEDOUT:
             try:
                 print("ATS cancelling job: bkill -J " + test.jobname)
                 # Argument list without a shell: the job name is passed
                 # verbatim and a missing bkill surfaces as OSError below.
                 retcode = subprocess.call(["bkill", "-J", test.jobname])
                 if retcode < 0:
                     # A negative return code is -signal; negate it so the
                     # message does not show a double minus sign.
                     log("---- bkill() in lsf_asq.py, command= bkill -J %s failed with return code -%d  ----"
                         % (test.jobname, -retcode),
                         echo=True)
             except OSError as e:
                 log("---- bkill() in lsf_asq.py, execution of command failed (bkill -J %s) failed:  %s----"
                     % (test.jobname, e),
                     echo=True)
コード例 #28
0
def runCommand(cmd_line, file_name=None, exit=True, verbose=False):
    """Run a command and capture its output.

    Args:
        cmd_line: Command line; split with ``shlex.split`` and run without
            a shell.
        file_name: When given, stdout goes to this file and stderr to
            ``<file_name>.err``.  Both are appended to if ``file_name``
            already exists, otherwise both are created or truncated.
        exit: When true, a failure to run the command raises
            ``SystemExit(1)``.
        verbose: Passed to ``log`` as ``echo`` for the command-line message.

    Returns:
        The ``(stdout, stderr)`` tuple from ``communicate()``: text when
        captured through pipes, ``(None, None)`` when redirected to files
        or when the command could not be run and ``exit`` is false.
    """
    popen_args = shlex.split(cmd_line)

    log('runCommand command line: %s' % cmd_line, echo=verbose)

    # Pre-bind the results so the final return is valid even when the
    # command could not be started and ``exit`` is false.
    stdout_txt = stderr_txt = None
    out_file = err_file = None

    try:
        if file_name is not None:
            mode = 'a' if os.path.exists(file_name) else 'w'
            out_file = open(file_name, mode)
            err_file = open('%s.err' % file_name, mode)

        process = Popen(popen_args,
                        stdout=out_file if out_file is not None else PIPE,
                        stderr=err_file if err_file is not None else PIPE,
                        text=True)
        (stdout_txt, stderr_txt) = process.communicate()

    except CalledProcessError as error:
        log('Command failed: error code %d' % error.returncode, echo=True)
        log('Failed Command: %s' % cmd_line, echo=True)
        if exit:
            raise SystemExit(1)
    except OSError as error:
        # OSError has no ``child_traceback`` attribute on Python 3; report
        # the error itself instead of failing inside the handler.
        log('Command failed with OSError: %s' % error, echo=True)
        log('Failed Command: %s' % cmd_line, echo=True)
        if exit:
            raise SystemExit(1)
    finally:
        # Close the redirect files on success and on every failure path.
        for handle in (out_file, err_file):
            if handle is not None:
                handle.close()

    return (stdout_txt, stderr_txt)
コード例 #29
0
def copyAndRenameFile(filename, newfilename, srcdir, destdir, groupID):
    """Copy ``srcdir/filename`` into ``destdir`` under a new base name.

    Only the base name of ``newfilename`` is used.  After copying, the file
    is assigned to ``groupID`` with mode ``rw-rw----``; a failure there is
    logged as a warning.

    Returns:
        The destination path, or None (after logging a warning) when the
        source is not an existing file.
    """
    source_path = os.path.join(srcdir, filename)
    if not os.path.isfile(source_path):
        log("WARNING - copyAndRenameFile: %s file does not exist in %s." %
            (filename, srcdir),
            echo=True)
        return None

    target_path = os.path.join(destdir, os.path.basename(newfilename))
    shutil.copyfile(source_path, target_path)

    group_read_write = (stat.S_IREAD | stat.S_IWRITE |
                        stat.S_IRGRP | stat.S_IWGRP)
    try:
        os.chown(target_path, -1, groupID)
        os.chmod(target_path, group_read_write)
    except OSError as error:
        log('WARNING - failed to set permissions on %s: %s' %
            (target_path, error.strerror),
            echo=True)

    return target_path
コード例 #30
0
    def noteLaunch(self, test):
        """Claim HTC nodes or processors for a just-launched test.

        HTC tests claim ``test.nodes`` HTC nodes.  Other tests claim
        processors for their node count rounded up to a multiple of
        ``numNodesPerTest``.  The running-test figure is then recomputed and
        the periodic report is triggered.
        """
        # noteLaunch is called by machines.py (startRun() -> launch() -> noteLaunch()
        import math

        if test.useHTCNode:
            self.numHTCUsed += test.nodes
        else:
            nodeGroups = math.ceil(test.nodes / float(self.numNodesPerTest))
            self.npBusy += nodeGroups * self.numNodesPerTest * self.numCPUPerNode

        fromProcs = self.npBusy / self.numNodesPerTest
        self.numberTestsRunning = self.numHTCUsed + fromProcs

        if debug():
            details = (test.name, self.numberTestsRunning, self.npBusy, self.numHTCUsed)
            log("dawnHTC.py:  Launched %s,\tnow running %d tests,\t#nodes used = %d,\t#htc used= %d" % details,
                echo=True)

        self.periodicReport()