コード例 #1
0
 def startStagerThread(self, job):
     """Create and start the DroidStager thread for ``job``.

     Args:
         job: job description, passed through to ``DroidStager`` unchanged.

     Returns:
         ``(0, None)`` on success, or ``(-1, traceback_text)`` when the
         stager could not be imported, constructed or started.
     """
     self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" %
                         (self.__rank, os.getcwd()))
     try:
         # Imported inside the try so that an import failure is reported
         # through the return value like any other start-up error.
         from pandayoda.yodaexe.DroidStager import DroidStager
         self.__stagerThread = DroidStager(self.__globalWorkingDir,
                                           self.__localWorkingDir,
                                           outputs=self.__outputs,
                                           job=job,
                                           esJobManager=self.__esJobManager,
                                           outputDir=self.__outputDir,
                                           rank=self.__rank,
                                           logger=self.__tmpLog)
         self.__stagerThread.start()
         return 0, None
     except Exception:
         # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
         # and SystemExit are not converted into a -1 status.  The traceback
         # is formatted once so the logged and returned text are identical.
         errMsg = traceback.format_exc()
         self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" %
                               (self.__rank, errMsg))
         return -1, errMsg
コード例 #2
0
ファイル: Droid.py プロジェクト: PanDAWMS/pilot
 def startStagerThread(self, job):
     """Create and start the DroidStager thread for ``job``.

     Args:
         job: job description, passed through to ``DroidStager`` unchanged.

     Returns:
         ``(0, None)`` on success, or ``(-1, traceback_text)`` when the
         stager could not be imported, constructed or started.
     """
     self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" %(self.__rank, os.getcwd()))
     try:
         # Imported inside the try so that an import failure is reported
         # through the return value like any other start-up error.
         from pandayoda.yodaexe.DroidStager import DroidStager
         self.__stagerThread = DroidStager(self.__globalWorkingDir, self.__localWorkingDir, outputs=self.__outputs, job=job, esJobManager=self.__esJobManager, outputDir=self.__outputDir, rank=self.__rank, logger=self.__tmpLog)
         self.__stagerThread.start()
         return 0, None
     except Exception:
         # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
         # and SystemExit are not converted into a -1 status.  The traceback
         # is formatted once so the logged and returned text are identical.
         errMsg = traceback.format_exc()
         self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" % (self.__rank, errMsg))
         return -1, errMsg
コード例 #3
0
class Droid(threading.Thread):
    """Per-rank worker that requests jobs/event ranges and reports results.

    Every request is sent through an ``Interaction.Requester`` and each rank
    works inside its own ``rank_<rank>`` directory.  ``stop``, ``copyOutput``
    and ``updateOutputs`` are referenced below but are not defined in this
    part of the class.
    """

    def __init__(self,
                 globalWorkingDir,
                 localWorkingDir,
                 rank=None,
                 nonMPIMode=False,
                 reserveCores=0,
                 outputDir=None):
        """Set up logging, the requester, the rank working dir and signals.

        Args:
            globalWorkingDir: directory used as the default ``PilotHomeDir``
                and as the copy target in ``postExecJob``.
            localWorkingDir: directory under which ``rank_<rank>`` is created.
            rank: rank to use when ``nonMPIMode`` is true; otherwise the rank
                is taken from the requester.
            nonMPIMode: forwarded to the requester; when false, signal
                handlers pointing at ``self.stop`` are installed.
            reserveCores: number subtracted from the job's
                ``ATHENA_PROC_NUMBER`` in ``setup``.
            outputDir: forwarded to the stager thread.
        """
        threading.Thread.__init__(self)
        self.__globalWorkingDir = globalWorkingDir
        self.__localWorkingDir = localWorkingDir
        self.__currentDir = None
        self.__tmpLog = Logger.Logger(filename='Droid.log')
        self.__comm = Interaction.Requester(rank=rank,
                                            nonMPIMode=nonMPIMode,
                                            logger=self.__tmpLog)
        self.__esJobManager = None
        self.__isFinished = False
        if nonMPIMode:
            self.__rank = rank
        else:
            self.__rank = self.__comm.getRank()
        self.__tmpLog.info("Rank %s: Global working dir: %s" %
                           (self.__rank, self.__globalWorkingDir))
        # ``in`` instead of the deprecated dict.has_key(), which no longer
        # exists on Python 3.
        if 'PilotHomeDir' not in os.environ:
            os.environ['PilotHomeDir'] = self.__globalWorkingDir

        self.initWorkingDir()
        self.__tmpLog.info("Rank %s: Current working dir: %s" %
                           (self.__rank, self.__currentDir))

        # Per-job state; populated by setup().
        self.__jobId = None
        self.__jobWorkingDir = None
        self.__startTimeOneJobDroid = None
        self.__cpuTimeOneJobDroid = None
        self.__poolFileCatalog = None
        self.__inputFiles = None
        self.__copyInputFiles = None
        self.__preSetup = None
        self.__postRun = None
        self.__ATHENA_PROC_NUMBER = 1
        self.__firstGetEventRanges = True
        self.__outputDir = outputDir

        self.__yodaToOS = False

        self.reserveCores = reserveCores
        self.__hostname = socket.getfqdn()

        self.__outputs = Queue()
        self.__jobMetrics = {}
        self.__stagerThread = None

        self.__stop = False

        if not nonMPIMode:
            # Route all of these signals to the same handler.
            for signum in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
                           signal.SIGXCPU, signal.SIGUSR1, signal.SIGBUS):
                signal.signal(signum, self.stop)

    def initWorkingDir(self):
        """Create ``<localWorkingDir>/rank_<rank>`` if needed and chdir into it."""
        curdir = _abspath(self.__localWorkingDir)
        wkdirname = "rank_%s" % str(self.__rank)
        wkdir = _abspath(_join(curdir, wkdirname))
        if not os.path.exists(wkdir):
            os.makedirs(wkdir)
        os.chdir(wkdir)
        self.__currentDir = wkdir

    def postExecJob(self):
        """Clean up after a job.

        Removes the input files that ``setup`` copied into the current
        directory, copies the rank directory to the global working dir when
        the two differ, and runs the job's ``PostRun`` step if one was given.
        """
        if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
            for inputFile in self.__inputFiles:
                localInputFile = os.path.join(os.getcwd(),
                                              os.path.basename(inputFile))
                self.__tmpLog.debug("Rank %s: Remove input file: %s" %
                                    (self.__rank, localInputFile))
                os.remove(localInputFile)

        if self.__globalWorkingDir != self.__localWorkingDir:
            # NOTE(review): the paths are concatenated into a shell command
            # unquoted, so directories containing spaces or shell
            # metacharacters will not be copied correctly.
            command = "cp -fr " + self.__currentDir + " " + self.__globalWorkingDir
            self.__tmpLog.debug(
                "Rank %s: copy files from local working directory to global working dir(cmd: %s)"
                % (self.__rank, command))
            status, output = commands.getstatusoutput(command)
            self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" %
                                (self.__rank, status, output))

        if self.__postRun and self.__esJobManager:
            self.__esJobManager.postRun(self.__postRun)

    def setup(self, job):
        """Prepare the working area and start the job manager for ``job``.

        Args:
            job: mapping describing the job.  ``AthenaMPCmd`` and
                ``TokenExtractCmd`` are required; ``JobId``,
                ``PoolFileCatalog``, ``InputFiles``, ``CopyInputFiles``,
                ``PreSetup``, ``PostRun``, ``yodaToOS``,
                ``ATHENA_PROC_NUMBER`` and ``GlobalWorkingDir`` are optional.
                ``job["AthenaMPCmd"]`` is modified in place.

        Returns:
            ``(True, None)`` on success, otherwise ``(False, message)``.
        """
        try:
            self.__jobId = job.get("JobId", None)
            self.__startTimeOneJobDroid = time.time()
            self.__cpuTimeOneJobDroid = os.times()
            self.__poolFileCatalog = job.get('PoolFileCatalog', None)
            self.__inputFiles = job.get('InputFiles', None)
            self.__copyInputFiles = job.get('CopyInputFiles', False)
            self.__preSetup = job.get('PreSetup', None)
            self.__postRun = job.get('PostRun', None)

            self.__yodaToOS = job.get('yodaToOS', False)

            self.__ATHENA_PROC_NUMBER = int(job.get('ATHENA_PROC_NUMBER', 1))
            self.__ATHENA_PROC_NUMBER -= self.reserveCores
            # NOTE(review): a result of exactly 0 is left unchanged; only
            # negative values are reset to 1.  Confirm whether "< 1" was
            # intended.
            if self.__ATHENA_PROC_NUMBER < 0:
                self.__ATHENA_PROC_NUMBER = 1
            job["AthenaMPCmd"] = "export ATHENA_PROC_NUMBER=" + str(
                self.__ATHENA_PROC_NUMBER) + "; " + job["AthenaMPCmd"]
            self.__jobWorkingDir = job.get('GlobalWorkingDir', None)
            if self.__jobWorkingDir:
                # Switch to a per-rank directory under the job's own working
                # dir and point logging at a Droid.log inside it.
                self.__jobWorkingDir = os.path.join(self.__jobWorkingDir,
                                                    'rank_%s' % self.__rank)
                if not os.path.exists(self.__jobWorkingDir):
                    os.makedirs(self.__jobWorkingDir)
                os.chdir(self.__jobWorkingDir)
                logFile = os.path.join(self.__jobWorkingDir, 'Droid.log')
                logging.basicConfig(filename=logFile, level=logging.DEBUG)
                self.__tmpLog = Logger.Logger()

            if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
                for inputFile in self.__inputFiles:
                    shutil.copy(inputFile, './')

                # Write a local copy of the catalog with the HPCWORKINGDIR
                # placeholder replaced by the current directory; the
                # untouched original is kept next to it as "<name>.back".
                pfc_name = os.path.basename(self.__poolFileCatalog)
                pfc_name = os.path.join(os.getcwd(), pfc_name)
                pfc_name_back = pfc_name + ".back"
                shutil.copy2(self.__poolFileCatalog, pfc_name_back)
                with open(pfc_name, 'wt') as pfc_out:
                    with open(pfc_name_back, 'rt') as pfc_in:
                        for line in pfc_in:
                            pfc_out.write(
                                line.replace('HPCWORKINGDIR', os.getcwd()))

                job["AthenaMPCmd"] = job["AthenaMPCmd"].replace(
                    'HPCWORKINGDIR', os.getcwd())

            self.__esJobManager = EventServerJobManager(
                self.__rank,
                self.__ATHENA_PROC_NUMBER,
                workingDir=self.__jobWorkingDir)
            status, output = self.__esJobManager.preSetup(self.__preSetup)
            if status != 0:
                return False, output

            status, output = self.startStagerThread(job)
            if status != 0:
                self.__tmpLog.warning(
                    "Rank %s: failed to start stager thread(status: %s, output: %s)"
                    % (self.__rank, status, output))
                return False, output

            self.__esJobManager.init(
                socketname='EventService_EventRanges',
                context='local',
                athenaMPCmd=job["AthenaMPCmd"],
                tokenExtractorCmd=job["TokenExtractCmd"])
            return True, None
        except:
            # Deliberately catches everything so the job manager is always
            # terminated when setup is interrupted.
            errMsg = "Failed to init EventServerJobManager: %s" % str(
                traceback.format_exc())
            # The failure may have happened before the manager was created;
            # calling terminate() on None would raise from inside this
            # handler and hide the real error.
            if self.__esJobManager is not None:
                self.__esJobManager.terminate()
            return False, errMsg

    def getJob(self):
        """Ask the requester for a job.

        Returns:
            ``(True, job)`` when the request succeeded with status code 0 and
            a non-empty job, otherwise ``(False, None)``.
        """
        request = {'Test': 'TEST', 'rank': self.__rank}
        self.__tmpLog.debug("Rank %s: getJob(request: %s)" %
                            (self.__rank, request))
        status, output = self.__comm.sendRequest('getJob', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" %
                            (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            job = output["job"]
            if statusCode == 0 and job:
                return True, job
        return False, None

    def startStagerThread(self, job):
        """Create and start the DroidStager thread for ``job``.

        Returns:
            ``(0, None)`` on success, or ``(-1, traceback_text)`` when the
            stager could not be imported, constructed or started.
        """
        self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" %
                            (self.__rank, os.getcwd()))
        try:
            # Imported inside the try so that an import failure is reported
            # through the return value like any other start-up error.
            from pandayoda.yodaexe.DroidStager import DroidStager
            self.__stagerThread = DroidStager(self.__globalWorkingDir,
                                              self.__localWorkingDir,
                                              outputs=self.__outputs,
                                              job=job,
                                              esJobManager=self.__esJobManager,
                                              outputDir=self.__outputDir,
                                              rank=self.__rank,
                                              logger=self.__tmpLog)
            self.__stagerThread.start()
            return 0, None
        except Exception:
            # Format once so the logged and returned text are identical.
            errMsg = traceback.format_exc()
            self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" %
                                  (self.__rank, errMsg))
            return -1, errMsg

    def stopStagerThread(self):
        """Ask the stager thread to stop and block until it reports finished.

        ``updateOutputs`` is called once per second while waiting.
        """
        self.__tmpLog.debug("Rank %s: stopStagerThread: workdir: %s" %
                            (self.__rank, os.getcwd()))
        self.__stagerThread.stop()
        self.__tmpLog.debug("Rank %s: waiting stager thread to finish" %
                            (self.__rank))
        while not self.__stagerThread.isFinished():
            self.updateOutputs()
            time.sleep(1)
        self.__tmpLog.debug("Rank %s: stager thread finished" % (self.__rank))

    def getEventRanges(self, nRanges=1):
        """Request up to ``nRanges`` event ranges for the current job.

        Returns:
            ``(True, eventRanges)`` when the request succeeded with status
            code 0, otherwise ``(False, None)``.
        """
        request = {'jobId': self.__jobId, 'nRanges': nRanges}
        self.__tmpLog.debug("Rank %s: getEventRanges(request: %s)" %
                            (self.__rank, request))
        status, output = self.__comm.sendRequest('getEventRanges', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" %
                            (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            eventRanges = output['eventRanges']
            if statusCode == 0:
                return True, eventRanges
        return False, None

    def updateEventRange(self, output):
        """Report one finished event range.

        Args:
            output: comma-separated string whose second field is the event
                range ID.

        Returns:
            ``True`` when the update was accepted with status code 0;
            ``False`` when the ID cannot be parsed, the output copy fails, or
            the request is rejected.
        """
        try:
            eventRangeID = output.split(",")[1]
        except Exception as e:
            self.__tmpLog.warning(
                "Rank %s: failed to get eventRangeID from output: %s" %
                (self.__rank, output))
            self.__tmpLog.warning("Rank %s: error message: %s" %
                                  (self.__rank, str(e)))
            # Without an ID there is nothing to report; continuing would fail
            # later on the unbound eventRangeID.
            return False
        status, output = self.copyOutput(output)
        if status != 0:
            self.__tmpLog.debug(
                "Rank %s: failed to copy output from local working dir to global working dir: %s"
                % (self.__rank, output))
            return False
        request = {
            "eventRangeID": eventRangeID,
            'eventStatus': "finished",
            "output": output
        }
        self.__tmpLog.debug("Rank %s: updateEventRange(request: %s)" %
                            (self.__rank, request))
        retStatus, retOutput = self.__comm.sendRequest('updateEventRange',
                                                       request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" %
                            (self.__rank, retStatus, retOutput))
        if retStatus:
            statusCode = retOutput["StatusCode"]
            if statusCode == 0:
                return True
        return False
コード例 #4
0
ファイル: Droid.py プロジェクト: PanDAWMS/pilot
class Droid(threading.Thread):
    """Per-rank worker that requests jobs/event ranges and reports results.

    Every request is sent through an ``Interaction.Requester`` and each rank
    works inside its own ``rank_<rank>`` directory.  ``stop``, ``copyOutput``
    and ``updateOutputs`` are referenced below but are not defined in this
    part of the class.
    """

    def __init__(self, globalWorkingDir, localWorkingDir, rank=None, nonMPIMode=False, reserveCores=0, outputDir=None):
        """Set up logging, the requester, the rank working dir and signals.

        Args:
            globalWorkingDir: directory used as the default ``PilotHomeDir``
                and as the copy target in ``postExecJob``.
            localWorkingDir: directory under which ``rank_<rank>`` is created.
            rank: rank to use when ``nonMPIMode`` is true; otherwise the rank
                is taken from the requester.
            nonMPIMode: forwarded to the requester; when false, signal
                handlers pointing at ``self.stop`` are installed.
            reserveCores: number subtracted from the job's
                ``ATHENA_PROC_NUMBER`` in ``setup``.
            outputDir: forwarded to the stager thread.
        """
        threading.Thread.__init__(self)
        self.__globalWorkingDir = globalWorkingDir
        self.__localWorkingDir = localWorkingDir
        self.__currentDir = None
        self.__tmpLog = Logger.Logger(filename='Droid.log')
        self.__comm = Interaction.Requester(rank=rank, nonMPIMode=nonMPIMode, logger=self.__tmpLog)
        self.__esJobManager = None
        self.__isFinished = False
        if nonMPIMode:
            self.__rank = rank
        else:
            self.__rank = self.__comm.getRank()
        self.__tmpLog.info("Rank %s: Global working dir: %s" % (self.__rank, self.__globalWorkingDir))
        # ``in`` instead of the deprecated dict.has_key(), which no longer
        # exists on Python 3.
        if 'PilotHomeDir' not in os.environ:
            os.environ['PilotHomeDir'] = self.__globalWorkingDir

        self.initWorkingDir()
        self.__tmpLog.info("Rank %s: Current working dir: %s" % (self.__rank, self.__currentDir))

        # Per-job state; populated by setup().
        self.__jobId = None
        self.__jobWorkingDir = None
        self.__startTimeOneJobDroid = None
        self.__cpuTimeOneJobDroid = None
        self.__poolFileCatalog = None
        self.__inputFiles = None
        self.__copyInputFiles = None
        self.__preSetup = None
        self.__postRun = None
        self.__ATHENA_PROC_NUMBER = 1
        self.__firstGetEventRanges = True
        self.__outputDir = outputDir

        self.__yodaToOS = False

        self.reserveCores = reserveCores
        self.__hostname = socket.getfqdn()

        self.__outputs = Queue()
        self.__jobMetrics = {}
        self.__stagerThread = None

        self.__stop = False

        if not nonMPIMode:
            # Route all of these signals to the same handler.
            for signum in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV, signal.SIGXCPU, signal.SIGUSR1, signal.SIGBUS):
                signal.signal(signum, self.stop)

    def initWorkingDir(self):
        """Create ``<localWorkingDir>/rank_<rank>`` if needed and chdir into it."""
        curdir = _abspath(self.__localWorkingDir)
        wkdirname = "rank_%s" % str(self.__rank)
        wkdir = _abspath(_join(curdir, wkdirname))
        if not os.path.exists(wkdir):
            os.makedirs(wkdir)
        os.chdir(wkdir)
        self.__currentDir = wkdir

    def postExecJob(self):
        """Clean up after a job.

        Removes the input files that ``setup`` copied into the current
        directory, copies the rank directory to the global working dir when
        the two differ, and runs the job's ``PostRun`` step if one was given.
        """
        if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
            for inputFile in self.__inputFiles:
                localInputFile = os.path.join(os.getcwd(), os.path.basename(inputFile))
                self.__tmpLog.debug("Rank %s: Remove input file: %s" % (self.__rank, localInputFile))
                os.remove(localInputFile)

        if self.__globalWorkingDir != self.__localWorkingDir:
            # NOTE(review): the paths are concatenated into a shell command
            # unquoted, so directories containing spaces or shell
            # metacharacters will not be copied correctly.
            command = "cp -fr " + self.__currentDir + " " + self.__globalWorkingDir
            self.__tmpLog.debug("Rank %s: copy files from local working directory to global working dir(cmd: %s)" % (self.__rank, command))
            status, output = commands.getstatusoutput(command)
            self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))

        if self.__postRun and self.__esJobManager:
            self.__esJobManager.postRun(self.__postRun)

    def setup(self, job):
        """Prepare the working area and start the job manager for ``job``.

        Args:
            job: mapping describing the job.  ``AthenaMPCmd`` and
                ``TokenExtractCmd`` are required; ``JobId``,
                ``PoolFileCatalog``, ``InputFiles``, ``CopyInputFiles``,
                ``PreSetup``, ``PostRun``, ``yodaToOS``,
                ``ATHENA_PROC_NUMBER`` and ``GlobalWorkingDir`` are optional.
                ``job["AthenaMPCmd"]`` is modified in place.

        Returns:
            ``(True, None)`` on success, otherwise ``(False, message)``.
        """
        try:
            self.__jobId = job.get("JobId", None)
            self.__startTimeOneJobDroid = time.time()
            self.__cpuTimeOneJobDroid = os.times()
            self.__poolFileCatalog = job.get('PoolFileCatalog', None)
            self.__inputFiles = job.get('InputFiles', None)
            self.__copyInputFiles = job.get('CopyInputFiles', False)
            self.__preSetup = job.get('PreSetup', None)
            self.__postRun = job.get('PostRun', None)

            self.__yodaToOS = job.get('yodaToOS', False)

            self.__ATHENA_PROC_NUMBER = int(job.get('ATHENA_PROC_NUMBER', 1))
            self.__ATHENA_PROC_NUMBER -= self.reserveCores
            # NOTE(review): a result of exactly 0 is left unchanged; only
            # negative values are reset to 1.  Confirm whether "< 1" was
            # intended.
            if self.__ATHENA_PROC_NUMBER < 0:
                self.__ATHENA_PROC_NUMBER = 1
            job["AthenaMPCmd"] = "export ATHENA_PROC_NUMBER=" + str(self.__ATHENA_PROC_NUMBER) + "; " + job["AthenaMPCmd"]
            self.__jobWorkingDir = job.get('GlobalWorkingDir', None)
            if self.__jobWorkingDir:
                # Switch to a per-rank directory under the job's own working
                # dir and point logging at a Droid.log inside it.
                self.__jobWorkingDir = os.path.join(self.__jobWorkingDir, 'rank_%s' % self.__rank)
                if not os.path.exists(self.__jobWorkingDir):
                    os.makedirs(self.__jobWorkingDir)
                os.chdir(self.__jobWorkingDir)
                logFile = os.path.join(self.__jobWorkingDir, 'Droid.log')
                logging.basicConfig(filename=logFile, level=logging.DEBUG)
                self.__tmpLog = Logger.Logger()

            if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
                for inputFile in self.__inputFiles:
                    shutil.copy(inputFile, './')

                # Write a local copy of the catalog with the HPCWORKINGDIR
                # placeholder replaced by the current directory; the
                # untouched original is kept next to it as "<name>.back".
                pfc_name = os.path.basename(self.__poolFileCatalog)
                pfc_name = os.path.join(os.getcwd(), pfc_name)
                pfc_name_back = pfc_name + ".back"
                shutil.copy2(self.__poolFileCatalog, pfc_name_back)
                with open(pfc_name, 'wt') as pfc_out:
                    with open(pfc_name_back, 'rt') as pfc_in:
                        for line in pfc_in:
                            pfc_out.write(line.replace('HPCWORKINGDIR', os.getcwd()))

                job["AthenaMPCmd"] = job["AthenaMPCmd"].replace('HPCWORKINGDIR', os.getcwd())

            self.__esJobManager = EventServerJobManager(self.__rank, self.__ATHENA_PROC_NUMBER, workingDir=self.__jobWorkingDir)
            status, output = self.__esJobManager.preSetup(self.__preSetup)
            if status != 0:
                return False, output

            status, output = self.startStagerThread(job)
            if status != 0:
                self.__tmpLog.warning("Rank %s: failed to start stager thread(status: %s, output: %s)" % (self.__rank, status, output))
                return False, output

            self.__esJobManager.init(socketname='EventService_EventRanges', context='local', athenaMPCmd=job["AthenaMPCmd"], tokenExtractorCmd=job["TokenExtractCmd"])
            return True, None
        except:
            # Deliberately catches everything so the job manager is always
            # terminated when setup is interrupted.
            errMsg = "Failed to init EventServerJobManager: %s" % str(traceback.format_exc())
            # The failure may have happened before the manager was created;
            # calling terminate() on None would raise from inside this
            # handler and hide the real error.
            if self.__esJobManager is not None:
                self.__esJobManager.terminate()
            return False, errMsg

    def getJob(self):
        """Ask the requester for a job.

        Returns:
            ``(True, job)`` when the request succeeded with status code 0 and
            a non-empty job, otherwise ``(False, None)``.
        """
        request = {'Test':'TEST', 'rank': self.__rank}
        self.__tmpLog.debug("Rank %s: getJob(request: %s)" % (self.__rank, request))
        status, output = self.__comm.sendRequest('getJob',request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            job = output["job"]
            if statusCode == 0 and job:
                return True, job
        return False, None

    def startStagerThread(self, job):
        """Create and start the DroidStager thread for ``job``.

        Returns:
            ``(0, None)`` on success, or ``(-1, traceback_text)`` when the
            stager could not be imported, constructed or started.
        """
        self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" %(self.__rank, os.getcwd()))
        try:
            # Imported inside the try so that an import failure is reported
            # through the return value like any other start-up error.
            from pandayoda.yodaexe.DroidStager import DroidStager
            self.__stagerThread = DroidStager(self.__globalWorkingDir, self.__localWorkingDir, outputs=self.__outputs, job=job, esJobManager=self.__esJobManager, outputDir=self.__outputDir, rank=self.__rank, logger=self.__tmpLog)
            self.__stagerThread.start()
            return 0, None
        except Exception:
            # Format once so the logged and returned text are identical.
            errMsg = traceback.format_exc()
            self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" % (self.__rank, errMsg))
            return -1, errMsg

    def stopStagerThread(self):
        """Ask the stager thread to stop and block until it reports finished.

        ``updateOutputs`` is called once per second while waiting.
        """
        self.__tmpLog.debug("Rank %s: stopStagerThread: workdir: %s" %(self.__rank, os.getcwd()))
        self.__stagerThread.stop()
        self.__tmpLog.debug("Rank %s: waiting stager thread to finish" %(self.__rank))
        while not self.__stagerThread.isFinished():
            self.updateOutputs()
            time.sleep(1)
        self.__tmpLog.debug("Rank %s: stager thread finished" %(self.__rank))

    def getEventRanges(self, nRanges=1):
        """Request up to ``nRanges`` event ranges for the current job.

        Returns:
            ``(True, eventRanges)`` when the request succeeded with status
            code 0, otherwise ``(False, None)``.
        """
        request = {'jobId': self.__jobId, 'nRanges': nRanges}
        self.__tmpLog.debug("Rank %s: getEventRanges(request: %s)" % (self.__rank, request))
        status, output = self.__comm.sendRequest('getEventRanges',request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            eventRanges = output['eventRanges']
            if statusCode == 0:
                return True, eventRanges
        return False, None

    def updateEventRange(self, output):
        """Report one finished event range.

        Args:
            output: comma-separated string whose second field is the event
                range ID.

        Returns:
            ``True`` when the update was accepted with status code 0;
            ``False`` when the ID cannot be parsed, the output copy fails, or
            the request is rejected.
        """
        try:
            eventRangeID = output.split(",")[1]
        except Exception as e:
            self.__tmpLog.warning("Rank %s: failed to get eventRangeID from output: %s" % (self.__rank, output))
            self.__tmpLog.warning("Rank %s: error message: %s" % (self.__rank, str(e)))
            # Without an ID there is nothing to report; continuing would fail
            # later on the unbound eventRangeID.
            return False
        status, output = self.copyOutput(output)
        if status != 0:
            self.__tmpLog.debug("Rank %s: failed to copy output from local working dir to global working dir: %s" % (self.__rank, output))
            return False
        request = {"eventRangeID": eventRangeID,
                   'eventStatus': "finished",
                   "output": output}
        self.__tmpLog.debug("Rank %s: updateEventRange(request: %s)" % (self.__rank, request))
        retStatus, retOutput = self.__comm.sendRequest('updateEventRange',request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, retStatus, retOutput))
        if retStatus:
            statusCode = retOutput["StatusCode"]
            if statusCode == 0:
                return True
        return False