def startStagerThread(self, job):
    """Create and start the DroidStager thread for ``job``.

    The DroidStager class is imported lazily inside the ``try`` so that an
    import failure is reported through the return value instead of
    propagating to the caller.

    :param job: job description, handed to DroidStager unchanged.
    :return: ``(0, None)`` on success, ``(-1, traceback_text)`` on failure.
    """
    self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" % (self.__rank, os.getcwd()))
    try:
        from pandayoda.yodaexe.DroidStager import DroidStager
        self.__stagerThread = DroidStager(self.__globalWorkingDir,
                                          self.__localWorkingDir,
                                          outputs=self.__outputs,
                                          job=job,
                                          esJobManager=self.__esJobManager,
                                          outputDir=self.__outputDir,
                                          rank=self.__rank,
                                          logger=self.__tmpLog)
        self.__stagerThread.start()
        return 0, None
    except Exception:
        # Catch Exception rather than using a bare except so SystemExit and
        # KeyboardInterrupt are not swallowed; format the traceback once so
        # the logged text and the returned text are guaranteed identical.
        errMsg = traceback.format_exc()
        self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" % (self.__rank, errMsg))
        return -1, errMsg
def startStagerThread(self, job):
    """Create and start the DroidStager thread for ``job``.

    The DroidStager class is imported lazily inside the ``try`` so that an
    import failure is reported through the return value instead of
    propagating to the caller.

    :param job: job description, handed to DroidStager unchanged.
    :return: ``(0, None)`` on success, ``(-1, traceback_text)`` on failure.
    """
    self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" % (self.__rank, os.getcwd()))
    try:
        from pandayoda.yodaexe.DroidStager import DroidStager
        self.__stagerThread = DroidStager(self.__globalWorkingDir,
                                          self.__localWorkingDir,
                                          outputs=self.__outputs,
                                          job=job,
                                          esJobManager=self.__esJobManager,
                                          outputDir=self.__outputDir,
                                          rank=self.__rank,
                                          logger=self.__tmpLog)
        self.__stagerThread.start()
        return 0, None
    except Exception:
        # Catch Exception rather than using a bare except so SystemExit and
        # KeyboardInterrupt are not swallowed; format the traceback once so
        # the logged text and the returned text are guaranteed identical.
        errMsg = traceback.format_exc()
        self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" % (self.__rank, errMsg))
        return -1, errMsg
class Droid(threading.Thread):
    """Per-rank worker thread for event-service jobs.

    A Droid talks to its coordinator through an ``Interaction.Requester``
    (``getJob``, ``getEventRanges``, ``updateEventRange`` requests), prepares
    a per-rank working directory, and drives an ``EventServerJobManager``
    plus a ``DroidStager`` thread for the job it receives.

    NOTE(review): ``self.stop``, ``self.copyOutput`` and ``self.updateOutputs``
    are used below but are not defined in this section of the file; they are
    assumed to be defined elsewhere on this class.
    """

    def __init__(self, globalWorkingDir, localWorkingDir, rank=None, nonMPIMode=False,
                 reserveCores=0, outputDir=None):
        """Initialise logging, communication, the working directory and signal handlers.

        :param globalWorkingDir: global working directory; used as the default
            ``PilotHomeDir`` and as the copy target in ``postExecJob``.
        :param localWorkingDir: directory under which ``rank_<rank>`` is created.
        :param rank: rank to use when ``nonMPIMode`` is true; otherwise the
            rank is obtained from the requester.
        :param nonMPIMode: when true, the given ``rank`` is used as-is and no
            signal handlers are installed.
        :param reserveCores: number subtracted from the job's
            ``ATHENA_PROC_NUMBER`` in ``setup``.
        :param outputDir: passed through to the DroidStager thread.
        """
        threading.Thread.__init__(self)
        self.__globalWorkingDir = globalWorkingDir
        self.__localWorkingDir = localWorkingDir
        self.__currentDir = None
        self.__tmpLog = Logger.Logger(filename='Droid.log')
        self.__comm = Interaction.Requester(rank=rank, nonMPIMode=nonMPIMode, logger=self.__tmpLog)
        self.__esJobManager = None
        self.__isFinished = False
        if nonMPIMode:
            self.__rank = rank
        else:
            self.__rank = self.__comm.getRank()
        self.__tmpLog.info("Rank %s: Global working dir: %s" % (self.__rank, self.__globalWorkingDir))
        # Only provide a default; an already exported PilotHomeDir wins.
        if 'PilotHomeDir' not in os.environ:
            os.environ['PilotHomeDir'] = self.__globalWorkingDir
        self.initWorkingDir()
        self.__tmpLog.info("Rank %s: Current working dir: %s" % (self.__rank, self.__currentDir))

        # Per-job state, filled in by setup().
        self.__jobId = None
        self.__startTimeOneJobDroid = None
        self.__cpuTimeOneJobDroid = None
        self.__poolFileCatalog = None
        self.__inputFiles = None
        self.__copyInputFiles = None
        self.__preSetup = None
        self.__postRun = None
        self.__ATHENA_PROC_NUMBER = 1

        self.__firstGetEventRanges = True
        self.__outputDir = outputDir
        self.__yodaToOS = False
        self.reserveCores = reserveCores
        self.__hostname = socket.getfqdn()
        self.__outputs = Queue()
        self.__jobMetrics = {}
        self.__stagerThread = None
        self.__stop = False

        if not nonMPIMode:
            # signal.signal() may only be called from the main thread, so in
            # MPI mode this object has to be constructed there.
            for signum in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
                           signal.SIGXCPU, signal.SIGUSR1, signal.SIGBUS):
                signal.signal(signum, self.stop)

    def initWorkingDir(self):
        """Create ``<localWorkingDir>/rank_<rank>`` if needed and chdir into it."""
        # Create separate working directory for each rank
        curdir = _abspath(self.__localWorkingDir)
        wkdirname = "rank_%s" % str(self.__rank)
        wkdir = _abspath(_join(curdir, wkdirname))
        if not os.path.exists(wkdir):
            os.makedirs(wkdir)
        os.chdir(wkdir)
        self.__currentDir = wkdir

    def postExecJob(self):
        """Clean up after a job.

        Removes the input-file copies made by ``setup`` from the current
        directory, copies the rank directory to the global working directory
        when the two differ, and runs the job's ``PostRun`` step if one is set.
        """
        if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
            for inputFile in self.__inputFiles:
                localInputFile = os.path.join(os.getcwd(), os.path.basename(inputFile))
                self.__tmpLog.debug("Rank %s: Remove input file: %s" % (self.__rank, localInputFile))
                os.remove(localInputFile)

        if self.__globalWorkingDir != self.__localWorkingDir:
            # NOTE(review): the command is built by plain concatenation and run
            # through a shell; paths containing spaces or shell metacharacters
            # would break it.
            command = "cp -fr " + self.__currentDir + " " + self.__globalWorkingDir
            self.__tmpLog.debug("Rank %s: copy files from local working directory to global working dir(cmd: %s)" % (self.__rank, command))
            status, output = commands.getstatusoutput(command)
            self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))

        if self.__postRun and self.__esJobManager:
            self.__esJobManager.postRun(self.__postRun)

    def setup(self, job):
        """Prepare this rank to run ``job``.

        Reads the job settings, optionally switches to the job's own
        ``rank_<rank>`` directory, optionally copies input files and rewrites
        the pool file catalog, then creates the EventServerJobManager, starts
        the stager thread and initialises the manager.

        ``job["AthenaMPCmd"]`` is modified in place: an
        ``export ATHENA_PROC_NUMBER=...`` prefix is added and, when input
        files are copied, ``HPCWORKINGDIR`` is replaced by the current
        directory. ``job`` must contain ``AthenaMPCmd`` and ``TokenExtractCmd``.

        :param job: dict-like job description.
        :return: ``(True, None)`` on success, ``(False, message)`` on failure.
        """
        try:
            self.__jobId = job.get("JobId", None)
            self.__startTimeOneJobDroid = time.time()
            self.__cpuTimeOneJobDroid = os.times()
            self.__poolFileCatalog = job.get('PoolFileCatalog', None)
            self.__inputFiles = job.get('InputFiles', None)
            self.__copyInputFiles = job.get('CopyInputFiles', False)
            self.__preSetup = job.get('PreSetup', None)
            self.__postRun = job.get('PostRun', None)
            self.__yodaToOS = job.get('yodaToOS', False)

            self.__ATHENA_PROC_NUMBER = int(job.get('ATHENA_PROC_NUMBER', 1))
            self.__ATHENA_PROC_NUMBER -= self.reserveCores
            # NOTE(review): only negative results are reset; a result of
            # exactly 0 is passed through unchanged -- confirm this is intended.
            if self.__ATHENA_PROC_NUMBER < 0:
                self.__ATHENA_PROC_NUMBER = 1
            job["AthenaMPCmd"] = "export ATHENA_PROC_NUMBER=" + str(self.__ATHENA_PROC_NUMBER) + "; " + job["AthenaMPCmd"]

            self.__jobWorkingDir = job.get('GlobalWorkingDir', None)
            if self.__jobWorkingDir:
                self.__jobWorkingDir = os.path.join(self.__jobWorkingDir, 'rank_%s' % self.__rank)
                if not os.path.exists(self.__jobWorkingDir):
                    os.makedirs(self.__jobWorkingDir)
                os.chdir(self.__jobWorkingDir)
                # From here on, log into the job's own rank directory.
                logFile = os.path.join(self.__jobWorkingDir, 'Droid.log')
                logging.basicConfig(filename=logFile, level=logging.DEBUG)
                self.__tmpLog = Logger.Logger()

            if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
                for inputFile in self.__inputFiles:
                    shutil.copy(inputFile, './')

                # Keep the original catalog as "<name>.back" and write a copy
                # with the HPCWORKINGDIR placeholder resolved to the cwd.
                pfc_name = os.path.join(os.getcwd(), os.path.basename(self.__poolFileCatalog))
                pfc_name_back = pfc_name + ".back"
                shutil.copy2(self.__poolFileCatalog, pfc_name_back)
                with open(pfc_name, 'wt') as pfc_out:
                    with open(pfc_name_back, 'rt') as pfc_in:
                        for line in pfc_in:
                            pfc_out.write(line.replace('HPCWORKINGDIR', os.getcwd()))

                job["AthenaMPCmd"] = job["AthenaMPCmd"].replace('HPCWORKINGDIR', os.getcwd())

            self.__esJobManager = EventServerJobManager(self.__rank, self.__ATHENA_PROC_NUMBER,
                                                        workingDir=self.__jobWorkingDir)
            status, output = self.__esJobManager.preSetup(self.__preSetup)
            if status != 0:
                return False, output

            status, output = self.startStagerThread(job)
            if status != 0:
                self.__tmpLog.warning("Rank %s: failed to start stager thread(status: %s, output: %s)" % (self.__rank, status, output))
                return False, output

            self.__esJobManager.init(socketname='EventService_EventRanges', context='local',
                                     athenaMPCmd=job["AthenaMPCmd"],
                                     tokenExtractorCmd=job["TokenExtractCmd"])
            return True, None
        except Exception:
            errMsg = "Failed to init EventServerJobManager: %s" % str(traceback.format_exc())
            # The failure may have happened before the manager was created
            # (e.g. a missing job key); calling terminate() on None would
            # raise inside this handler and hide the real error.
            if self.__esJobManager is not None:
                self.__esJobManager.terminate()
            return False, errMsg

    def getJob(self):
        """Request a job from the coordinator.

        :return: ``(True, job)`` when the request succeeds, the reply's
            ``StatusCode`` is 0 and a job is present; otherwise ``(False, None)``.
        """
        request = {'Test': 'TEST', 'rank': self.__rank}
        self.__tmpLog.debug("Rank %s: getJob(request: %s)" % (self.__rank, request))
        status, output = self.__comm.sendRequest('getJob', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            job = output["job"]
            if statusCode == 0 and job:
                return True, job
        return False, None

    def startStagerThread(self, job):
        """Create and start the DroidStager thread for ``job``.

        :param job: job description, handed to DroidStager unchanged.
        :return: ``(0, None)`` on success, ``(-1, traceback_text)`` on failure.
        """
        self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" % (self.__rank, os.getcwd()))
        try:
            from pandayoda.yodaexe.DroidStager import DroidStager
            self.__stagerThread = DroidStager(self.__globalWorkingDir,
                                              self.__localWorkingDir,
                                              outputs=self.__outputs,
                                              job=job,
                                              esJobManager=self.__esJobManager,
                                              outputDir=self.__outputDir,
                                              rank=self.__rank,
                                              logger=self.__tmpLog)
            self.__stagerThread.start()
            return 0, None
        except Exception:
            # Format the traceback once so the logged and returned text match.
            errMsg = traceback.format_exc()
            self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" % (self.__rank, errMsg))
            return -1, errMsg

    def stopStagerThread(self):
        """Ask the stager thread to stop and wait until it reports finished.

        While waiting, ``updateOutputs`` is called once per second. Does
        nothing when no stager thread was ever started.
        """
        self.__tmpLog.debug("Rank %s: stopStagerThread: workdir: %s" % (self.__rank, os.getcwd()))
        if self.__stagerThread is None:
            # setup() failed early or was never called; nothing to wait for.
            self.__tmpLog.debug("Rank %s: no stager thread to stop" % self.__rank)
            return
        self.__stagerThread.stop()
        self.__tmpLog.debug("Rank %s: waiting stager thread to finish" % self.__rank)
        while not self.__stagerThread.isFinished():
            self.updateOutputs()
            time.sleep(1)
        self.__tmpLog.debug("Rank %s: stager thread finished" % self.__rank)

    def getEventRanges(self, nRanges=1):
        """Request up to ``nRanges`` event ranges for the current job.

        :param nRanges: number of ranges to ask for.
        :return: ``(True, eventRanges)`` when the request succeeds and the
            reply's ``StatusCode`` is 0; otherwise ``(False, None)``.
        """
        request = {'jobId': self.__jobId, 'nRanges': nRanges}
        self.__tmpLog.debug("Rank %s: getEventRanges(request: %s)" % (self.__rank, request))
        status, output = self.__comm.sendRequest('getEventRanges', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            eventRanges = output['eventRanges']
            if statusCode == 0:
                return True, eventRanges
        return False, None

    def updateEventRange(self, output):
        """Report one finished event range to the coordinator.

        The event range ID is taken from the second comma-separated field of
        ``output``. The output is then passed through ``copyOutput`` and the
        result is sent in an ``updateEventRange`` request.

        :param output: comma-separated output record.
        :return: ``True`` when the coordinator replies with ``StatusCode`` 0;
            ``False`` when the ID cannot be parsed, the copy fails, or the
            update is not acknowledged.
        """
        try:
            eventRangeID = output.split(",")[1]
        except Exception as e:
            self.__tmpLog.warning("Rank %s: failed to get eventRangeID from output: %s" % (self.__rank, output))
            self.__tmpLog.warning("Rank %s: error message: %s" % (self.__rank, str(e)))
            # Without an ID there is nothing to report; continuing would hit
            # an unbound eventRangeID when the request is built.
            return False

        status, output = self.copyOutput(output)
        if status != 0:
            self.__tmpLog.debug("Rank %s: failed to copy output from local working dir to global working dir: %s" % (self.__rank, output))
            return False

        request = {"eventRangeID": eventRangeID,
                   'eventStatus': "finished",
                   "output": output}
        self.__tmpLog.debug("Rank %s: updateEventRange(request: %s)" % (self.__rank, request))
        retStatus, retOutput = self.__comm.sendRequest('updateEventRange', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, retStatus, retOutput))
        if retStatus:
            statusCode = retOutput["StatusCode"]
            if statusCode == 0:
                return True
        return False
class Droid(threading.Thread):
    """Per-rank worker thread for event-service jobs.

    A Droid talks to its coordinator through an ``Interaction.Requester``
    (``getJob``, ``getEventRanges``, ``updateEventRange`` requests), prepares
    a per-rank working directory, and drives an ``EventServerJobManager``
    plus a ``DroidStager`` thread for the job it receives.

    NOTE(review): ``self.stop``, ``self.copyOutput`` and ``self.updateOutputs``
    are used below but are not defined in this section of the file; they are
    assumed to be defined elsewhere on this class.
    """

    def __init__(self, globalWorkingDir, localWorkingDir, rank=None, nonMPIMode=False,
                 reserveCores=0, outputDir=None):
        """Initialise logging, communication, the working directory and signal handlers.

        :param globalWorkingDir: global working directory; used as the default
            ``PilotHomeDir`` and as the copy target in ``postExecJob``.
        :param localWorkingDir: directory under which ``rank_<rank>`` is created.
        :param rank: rank to use when ``nonMPIMode`` is true; otherwise the
            rank is obtained from the requester.
        :param nonMPIMode: when true, the given ``rank`` is used as-is and no
            signal handlers are installed.
        :param reserveCores: number subtracted from the job's
            ``ATHENA_PROC_NUMBER`` in ``setup``.
        :param outputDir: passed through to the DroidStager thread.
        """
        threading.Thread.__init__(self)
        self.__globalWorkingDir = globalWorkingDir
        self.__localWorkingDir = localWorkingDir
        self.__currentDir = None
        self.__tmpLog = Logger.Logger(filename='Droid.log')
        self.__comm = Interaction.Requester(rank=rank, nonMPIMode=nonMPIMode, logger=self.__tmpLog)
        self.__esJobManager = None
        self.__isFinished = False
        if nonMPIMode:
            self.__rank = rank
        else:
            self.__rank = self.__comm.getRank()
        self.__tmpLog.info("Rank %s: Global working dir: %s" % (self.__rank, self.__globalWorkingDir))
        # Only provide a default; an already exported PilotHomeDir wins.
        if 'PilotHomeDir' not in os.environ:
            os.environ['PilotHomeDir'] = self.__globalWorkingDir
        self.initWorkingDir()
        self.__tmpLog.info("Rank %s: Current working dir: %s" % (self.__rank, self.__currentDir))

        # Per-job state, filled in by setup().
        self.__jobId = None
        self.__startTimeOneJobDroid = None
        self.__cpuTimeOneJobDroid = None
        self.__poolFileCatalog = None
        self.__inputFiles = None
        self.__copyInputFiles = None
        self.__preSetup = None
        self.__postRun = None
        self.__ATHENA_PROC_NUMBER = 1

        self.__firstGetEventRanges = True
        self.__outputDir = outputDir
        self.__yodaToOS = False
        self.reserveCores = reserveCores
        self.__hostname = socket.getfqdn()
        self.__outputs = Queue()
        self.__jobMetrics = {}
        self.__stagerThread = None
        self.__stop = False

        if not nonMPIMode:
            # signal.signal() may only be called from the main thread, so in
            # MPI mode this object has to be constructed there.
            for signum in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
                           signal.SIGXCPU, signal.SIGUSR1, signal.SIGBUS):
                signal.signal(signum, self.stop)

    def initWorkingDir(self):
        """Create ``<localWorkingDir>/rank_<rank>`` if needed and chdir into it."""
        # Create separate working directory for each rank
        curdir = _abspath(self.__localWorkingDir)
        wkdirname = "rank_%s" % str(self.__rank)
        wkdir = _abspath(_join(curdir, wkdirname))
        if not os.path.exists(wkdir):
            os.makedirs(wkdir)
        os.chdir(wkdir)
        self.__currentDir = wkdir

    def postExecJob(self):
        """Clean up after a job.

        Removes the input-file copies made by ``setup`` from the current
        directory, copies the rank directory to the global working directory
        when the two differ, and runs the job's ``PostRun`` step if one is set.
        """
        if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
            for inputFile in self.__inputFiles:
                localInputFile = os.path.join(os.getcwd(), os.path.basename(inputFile))
                self.__tmpLog.debug("Rank %s: Remove input file: %s" % (self.__rank, localInputFile))
                os.remove(localInputFile)

        if self.__globalWorkingDir != self.__localWorkingDir:
            # NOTE(review): the command is built by plain concatenation and run
            # through a shell; paths containing spaces or shell metacharacters
            # would break it.
            command = "cp -fr " + self.__currentDir + " " + self.__globalWorkingDir
            self.__tmpLog.debug("Rank %s: copy files from local working directory to global working dir(cmd: %s)" % (self.__rank, command))
            status, output = commands.getstatusoutput(command)
            self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))

        if self.__postRun and self.__esJobManager:
            self.__esJobManager.postRun(self.__postRun)

    def setup(self, job):
        """Prepare this rank to run ``job``.

        Reads the job settings, optionally switches to the job's own
        ``rank_<rank>`` directory, optionally copies input files and rewrites
        the pool file catalog, then creates the EventServerJobManager, starts
        the stager thread and initialises the manager.

        ``job["AthenaMPCmd"]`` is modified in place: an
        ``export ATHENA_PROC_NUMBER=...`` prefix is added and, when input
        files are copied, ``HPCWORKINGDIR`` is replaced by the current
        directory. ``job`` must contain ``AthenaMPCmd`` and ``TokenExtractCmd``.

        :param job: dict-like job description.
        :return: ``(True, None)`` on success, ``(False, message)`` on failure.
        """
        try:
            self.__jobId = job.get("JobId", None)
            self.__startTimeOneJobDroid = time.time()
            self.__cpuTimeOneJobDroid = os.times()
            self.__poolFileCatalog = job.get('PoolFileCatalog', None)
            self.__inputFiles = job.get('InputFiles', None)
            self.__copyInputFiles = job.get('CopyInputFiles', False)
            self.__preSetup = job.get('PreSetup', None)
            self.__postRun = job.get('PostRun', None)
            self.__yodaToOS = job.get('yodaToOS', False)

            self.__ATHENA_PROC_NUMBER = int(job.get('ATHENA_PROC_NUMBER', 1))
            self.__ATHENA_PROC_NUMBER -= self.reserveCores
            # NOTE(review): only negative results are reset; a result of
            # exactly 0 is passed through unchanged -- confirm this is intended.
            if self.__ATHENA_PROC_NUMBER < 0:
                self.__ATHENA_PROC_NUMBER = 1
            job["AthenaMPCmd"] = "export ATHENA_PROC_NUMBER=" + str(self.__ATHENA_PROC_NUMBER) + "; " + job["AthenaMPCmd"]

            self.__jobWorkingDir = job.get('GlobalWorkingDir', None)
            if self.__jobWorkingDir:
                self.__jobWorkingDir = os.path.join(self.__jobWorkingDir, 'rank_%s' % self.__rank)
                if not os.path.exists(self.__jobWorkingDir):
                    os.makedirs(self.__jobWorkingDir)
                os.chdir(self.__jobWorkingDir)
                # From here on, log into the job's own rank directory.
                logFile = os.path.join(self.__jobWorkingDir, 'Droid.log')
                logging.basicConfig(filename=logFile, level=logging.DEBUG)
                self.__tmpLog = Logger.Logger()

            if self.__copyInputFiles and self.__inputFiles is not None and self.__poolFileCatalog is not None:
                for inputFile in self.__inputFiles:
                    shutil.copy(inputFile, './')

                # Keep the original catalog as "<name>.back" and write a copy
                # with the HPCWORKINGDIR placeholder resolved to the cwd.
                pfc_name = os.path.join(os.getcwd(), os.path.basename(self.__poolFileCatalog))
                pfc_name_back = pfc_name + ".back"
                shutil.copy2(self.__poolFileCatalog, pfc_name_back)
                with open(pfc_name, 'wt') as pfc_out:
                    with open(pfc_name_back, 'rt') as pfc_in:
                        for line in pfc_in:
                            pfc_out.write(line.replace('HPCWORKINGDIR', os.getcwd()))

                job["AthenaMPCmd"] = job["AthenaMPCmd"].replace('HPCWORKINGDIR', os.getcwd())

            self.__esJobManager = EventServerJobManager(self.__rank, self.__ATHENA_PROC_NUMBER,
                                                        workingDir=self.__jobWorkingDir)
            status, output = self.__esJobManager.preSetup(self.__preSetup)
            if status != 0:
                return False, output

            status, output = self.startStagerThread(job)
            if status != 0:
                self.__tmpLog.warning("Rank %s: failed to start stager thread(status: %s, output: %s)" % (self.__rank, status, output))
                return False, output

            self.__esJobManager.init(socketname='EventService_EventRanges', context='local',
                                     athenaMPCmd=job["AthenaMPCmd"],
                                     tokenExtractorCmd=job["TokenExtractCmd"])
            return True, None
        except Exception:
            errMsg = "Failed to init EventServerJobManager: %s" % str(traceback.format_exc())
            # The failure may have happened before the manager was created
            # (e.g. a missing job key); calling terminate() on None would
            # raise inside this handler and hide the real error.
            if self.__esJobManager is not None:
                self.__esJobManager.terminate()
            return False, errMsg

    def getJob(self):
        """Request a job from the coordinator.

        :return: ``(True, job)`` when the request succeeds, the reply's
            ``StatusCode`` is 0 and a job is present; otherwise ``(False, None)``.
        """
        request = {'Test': 'TEST', 'rank': self.__rank}
        self.__tmpLog.debug("Rank %s: getJob(request: %s)" % (self.__rank, request))
        status, output = self.__comm.sendRequest('getJob', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            job = output["job"]
            if statusCode == 0 and job:
                return True, job
        return False, None

    def startStagerThread(self, job):
        """Create and start the DroidStager thread for ``job``.

        :param job: job description, handed to DroidStager unchanged.
        :return: ``(0, None)`` on success, ``(-1, traceback_text)`` on failure.
        """
        self.__tmpLog.debug("Rank %s: initStagerThread: workdir: %s" % (self.__rank, os.getcwd()))
        try:
            from pandayoda.yodaexe.DroidStager import DroidStager
            self.__stagerThread = DroidStager(self.__globalWorkingDir,
                                              self.__localWorkingDir,
                                              outputs=self.__outputs,
                                              job=job,
                                              esJobManager=self.__esJobManager,
                                              outputDir=self.__outputDir,
                                              rank=self.__rank,
                                              logger=self.__tmpLog)
            self.__stagerThread.start()
            return 0, None
        except Exception:
            # Format the traceback once so the logged and returned text match.
            errMsg = traceback.format_exc()
            self.__tmpLog.warning("Rank %s: Failed to initStagerThread: %s" % (self.__rank, errMsg))
            return -1, errMsg

    def stopStagerThread(self):
        """Ask the stager thread to stop and wait until it reports finished.

        While waiting, ``updateOutputs`` is called once per second. Does
        nothing when no stager thread was ever started.
        """
        self.__tmpLog.debug("Rank %s: stopStagerThread: workdir: %s" % (self.__rank, os.getcwd()))
        if self.__stagerThread is None:
            # setup() failed early or was never called; nothing to wait for.
            self.__tmpLog.debug("Rank %s: no stager thread to stop" % self.__rank)
            return
        self.__stagerThread.stop()
        self.__tmpLog.debug("Rank %s: waiting stager thread to finish" % self.__rank)
        while not self.__stagerThread.isFinished():
            self.updateOutputs()
            time.sleep(1)
        self.__tmpLog.debug("Rank %s: stager thread finished" % self.__rank)

    def getEventRanges(self, nRanges=1):
        """Request up to ``nRanges`` event ranges for the current job.

        :param nRanges: number of ranges to ask for.
        :return: ``(True, eventRanges)`` when the request succeeds and the
            reply's ``StatusCode`` is 0; otherwise ``(False, None)``.
        """
        request = {'jobId': self.__jobId, 'nRanges': nRanges}
        self.__tmpLog.debug("Rank %s: getEventRanges(request: %s)" % (self.__rank, request))
        status, output = self.__comm.sendRequest('getEventRanges', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, status, output))
        if status:
            statusCode = output["StatusCode"]
            eventRanges = output['eventRanges']
            if statusCode == 0:
                return True, eventRanges
        return False, None

    def updateEventRange(self, output):
        """Report one finished event range to the coordinator.

        The event range ID is taken from the second comma-separated field of
        ``output``. The output is then passed through ``copyOutput`` and the
        result is sent in an ``updateEventRange`` request.

        :param output: comma-separated output record.
        :return: ``True`` when the coordinator replies with ``StatusCode`` 0;
            ``False`` when the ID cannot be parsed, the copy fails, or the
            update is not acknowledged.
        """
        try:
            eventRangeID = output.split(",")[1]
        except Exception as e:
            self.__tmpLog.warning("Rank %s: failed to get eventRangeID from output: %s" % (self.__rank, output))
            self.__tmpLog.warning("Rank %s: error message: %s" % (self.__rank, str(e)))
            # Without an ID there is nothing to report; continuing would hit
            # an unbound eventRangeID when the request is built.
            return False

        status, output = self.copyOutput(output)
        if status != 0:
            self.__tmpLog.debug("Rank %s: failed to copy output from local working dir to global working dir: %s" % (self.__rank, output))
            return False

        request = {"eventRangeID": eventRangeID,
                   'eventStatus': "finished",
                   "output": output}
        self.__tmpLog.debug("Rank %s: updateEventRange(request: %s)" % (self.__rank, request))
        retStatus, retOutput = self.__comm.sendRequest('updateEventRange', request)
        self.__tmpLog.debug("Rank %s: (status: %s, output: %s)" % (self.__rank, retStatus, retOutput))
        if retStatus:
            statusCode = retOutput["StatusCode"]
            if statusCode == 0:
                return True
        return False