def getPluginClass(self, tmpVO):
    # instantiate concrete plugin
    adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
    if adderPluginClass == None:
        # use ATLAS plugin by default
        from AdderAtlasPlugin import AdderAtlasPlugin
        adderPluginClass = AdderAtlasPlugin
    self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
    return adderPluginClass
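# --- illustrative sketch, not part of the original source ---
# A minimal sketch of what a VO-specific adder plugin could look like, based only on how
# AdderGen uses the class returned by getPluginClass(). The names AdderDummyPlugin and
# DummyResult are hypothetical; only the members actually touched by AdderGen
# (the constructor keywords, execute(), result.statusCode, isSucceeded(), isTemporary(),
# mergingFiles, transferringFiles, and the optional datasetMap) come from the code above.
# How panda_config.getPlugin() maps a VO to such a class is not shown in this section.

class DummyResult(object):
    # container mirroring the fields AdderGen reads from adderPlugin.result
    SC_Succeeded = 0
    SC_Temporary = 1
    SC_Fatal = 2

    def __init__(self):
        self.statusCode = self.SC_Succeeded
        self.mergingFiles = []
        self.transferringFiles = []

    def isSucceeded(self):
        return self.statusCode == self.SC_Succeeded

    def isTemporary(self):
        return self.statusCode == self.SC_Temporary


class AdderDummyPlugin(object):
    # constructor signature follows the call site in AdderGen.run
    def __init__(self, job, taskBuffer=None, siteMapper=None, extraInfo=None, logger=None):
        self.job = job
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.extraInfo = extraInfo
        self.logger = logger
        self.result = DummyResult()
        # optional: AdderGen hands this to Closer when it is non-empty
        self.datasetMap = {}

    def execute(self):
        # register output files with the VO's data management system here;
        # on a recoverable error set the temporary code so AdderGen can retry later
        self.result.statusCode = self.result.SC_Succeeded
# --- end of illustrative sketch ---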
def run(self):
    try:
        # make a message instance
        tmpLog = LogWrapper(_logger, None)
        # run main procedure in the same process
        if not self.forkRun:
            tmpLog.debug('main start')
            tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
            # group jobs per VO
            voJobsMap = {}
            ddmFreeJobs = []
            tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
            for tmpJob in self.jobs:
                # set VO=local for DDM free
                if tmpJob.destinationSE == 'local':
                    tmpVO = 'local'
                else:
                    tmpVO = tmpJob.VO
                # make map
                if not voJobsMap.has_key(tmpVO):
                    voJobsMap[tmpVO] = []
                voJobsMap[tmpVO].append(tmpJob)
            # loop over all VOs
            for tmpVO, tmpJobList in voJobsMap.iteritems():
                tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO, len(tmpJobList)))
                # get plugin
                setupperPluginClass = panda_config.getPlugin('setupper_plugins', tmpVO)
                if setupperPluginClass == None:
                    # use ATLAS plug-in by default
                    from SetupperAtlasPlugin import SetupperAtlasPlugin
                    setupperPluginClass = SetupperAtlasPlugin
                tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                try:
                    # make plugin
                    setupperPlugin = setupperPluginClass(self.taskBuffer, self.jobs, tmpLog,
                                                         resubmit=self.resubmit,
                                                         pandaDDM=self.pandaDDM,
                                                         ddmAttempt=self.ddmAttempt,
                                                         onlyTA=self.onlyTA,
                                                         firstSubmission=self.firstSubmission)
                    # run plugin
                    tmpLog.debug('run plugin')
                    setupperPlugin.run()
                    # go forward if not TA
                    if not self.onlyTA:
                        # update jobs
                        tmpLog.debug('update jobs')
                        self.updateJobs(setupperPlugin.jobs + setupperPlugin.jumboJobs, tmpLog)
                        # execute post process
                        tmpLog.debug('post execute plugin')
                        setupperPlugin.postRun()
                    tmpLog.debug('done plugin')
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
            tmpLog.debug('main end')
        else:
            tmpLog.debug('fork start')
            # write jobs to file
            import os
            import cPickle as pickle
            outFileName = '%s/set.%s_%s' % (panda_config.logdir, self.jobs[0].PandaID,
                                            commands.getoutput('uuidgen'))
            outFile = open(outFileName, 'w')
            pickle.dump(self.jobs, outFile)
            outFile.close()
            # run main procedure in another process because python doesn't release memory
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,
                                                                panda_config.home_dir_cwd)
            com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                   (panda_config.pandaCommon_dir, panda_config.pandaPython_dir, panda_config.native_python,
                    panda_config.pandaPython_dir, outFileName)
            if self.onlyTA:
                com += " -t"
            if not self.firstSubmission:
                com += " -f"
            tmpLog.debug(com)
            # execute
            status, output = self.taskBuffer.processLimiter.getstatusoutput(com)
            tmpLog.debug("return from main process: %s %s" % (status, output))
            tmpLog.debug('fork end')
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('master failed with {0}:{1}'.format(errtype, errvalue))
def run(self):
    try:
        self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
        # lock XML
        self.lockXML = open(self.xmlFile)
        try:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except:
            self.logger.debug("cannot get lock : %s" % self.xmlFile)
            self.lockXML.close()
            # remove XML just in case for the final attempt
            if not self.ignoreTmpError:
                try:
                    # remove Catalog
                    os.remove(self.xmlFile)
                except:
                    pass
            return
        # check if file exists
        if not os.path.exists(self.xmlFile):
            self.logger.debug("not exist : %s" % self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
            except:
                pass
            return
        # query job
        self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                            fromWaiting=False, forAnal=True)[0]
        # check if job has finished
        if self.job == None:
            self.logger.debug(': job not found in DB')
        elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'merging']:
            self.logger.error(': invalid state -> %s' % self.job.jobStatus)
        elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
            self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
        else:
            # check file status in JEDI
            if not self.job.isCancelled() and not self.job.taskBufferErrorCode in [taskbuffer.ErrorCode.EC_PilotRetried]:
                fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
                self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
                if fileCheckInJEDI == None:
                    raise RuntimeError, 'failed to check file status in JEDI'
                if fileCheckInJEDI == False:
                    # set job status to failed since some file status is wrong in JEDI
                    self.jobStatus = 'failed'
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "wrong file status in source database"
                    self.logger.debug("set jobStatus={0} since input is inconsistent between Panda and JEDI".format(self.jobStatus))
                elif self.job.jobSubStatus in ['pilot_closed']:
                    # terminated by the pilot
                    self.logger.debug("going to closed since terminated by the pilot")
                    retClosed = self.taskBuffer.killJobs([self.jobID], 'pilot', '60', True)
                    if retClosed[0] == True:
                        self.logger.debug("end")
                        try:
                            # remove Catalog
                            os.remove(self.xmlFile)
                        except:
                            pass
                        # unlock XML
                        if self.lockXML != None:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        return
                # check for cloned jobs
                if EventServiceUtils.isJobCloningJob(self.job):
                    checkJC = self.taskBuffer.checkClonedJob(self.job)
                    if checkJC == None:
                        raise RuntimeError, 'failed to check the cloned job'
                    # failed to lock semaphore
                    if checkJC['lock'] == False:
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                        self.logger.debug("set jobStatus={0} since did not get semaphore for job cloning".format(self.jobStatus))
            # use failed for cancelled/closed jobs
            if self.job.isCancelled():
                self.jobStatus = 'failed'
                # reset error codes to skip retrial module
                self.job.pilotErrorCode = 0
                self.job.exeErrorCode = 0
                self.job.ddmErrorCode = 0
            # keep old status
            oldJobStatus = self.job.jobStatus
            # set job status
            if not self.job.jobStatus in ['transferring']:
                self.job.jobStatus = self.jobStatus
            addResult = None
            adderPlugin = None
            # parse XML
            parseResult = self.parseXML()
            if parseResult < 2:
                # interaction with DDM
                try:
                    # set VO=local for DDM free
                    if self.job.destinationSE == 'local':
                        tmpVO = 'local'
                    else:
                        tmpVO = self.job.VO
                    # instantiate concrete plugin
                    adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
                    if adderPluginClass == None:
                        # use ATLAS plugin by default
                        from AdderAtlasPlugin import AdderAtlasPlugin
                        adderPluginClass = AdderAtlasPlugin
                    self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                    adderPlugin = adderPluginClass(self.job,
                                                   taskBuffer=self.taskBuffer,
                                                   siteMapper=self.siteMapper,
                                                   extraInfo=self.extraInfo,
                                                   logger=self.logger)
                    # execute
                    self.logger.debug('plugin is ready')
                    adderPlugin.execute()
                    addResult = adderPlugin.result
                    self.logger.debug('plugin done with %s' % (addResult.statusCode))
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO, errtype, errvalue))
                    addResult = None
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "AdderPlugin failure"
                # ignore temporary errors
                if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                    self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                    self.logger.debug('escape')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        type, value, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (type, value))
                        self.logger.debug("cannot unlock XML")
                    return
                # failed
                if addResult == None or not addResult.isSucceeded():
                    self.job.jobStatus = 'failed'
            # set file status for failed jobs or failed transferring jobs
            if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                # First of all: check if job failed and in this case take first actions according to error table
                source, error_code, error_diag = None, None, None
                if self.job.pilotErrorCode:
                    source = 'pilotErrorCode'
                    error_code = self.job.pilotErrorCode
                    error_diag = self.job.pilotErrorDiag
                elif self.job.exeErrorCode:
                    source = 'exeErrorCode'
                    error_code = self.job.exeErrorCode
                    error_diag = self.job.exeErrorDiag
                elif self.job.ddmErrorCode:
                    source = 'ddmErrorCode'
                    error_code = self.job.ddmErrorCode
                    error_diag = self.job.ddmErrorDiag
                _logger.debug("updatejob has source %s, error_code %s and error_diag %s" % (source, error_code, error_diag))
                if source and error_code:
                    try:
                        self.logger.debug("AdderGen.run will call apply_retrial_rules")
                        retryModule.apply_retrial_rules(self.taskBuffer, self.job.PandaID, source,
                                                        error_code, error_diag, self.job.attemptNr)
                        self.logger.debug("apply_retrial_rules is back")
                    except Exception as e:
                        self.logger.debug("apply_retrial_rules excepted and needs to be investigated (%s)" % (e))
                self.job.jobStatus = 'failed'
                for file in self.job.Files:
                    if file.type in ['output', 'log']:
                        if addResult != None and file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                        else:
                            file.status = 'failed'
            else:
                # reset errors
                self.job.jobDispatcherErrorCode = 0
                self.job.jobDispatcherErrorDiag = 'NULL'
                # set status
                if addResult != None and addResult.mergingFiles != []:
                    # set status for merging
                    for file in self.job.Files:
                        if file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                    self.job.jobStatus = 'merging'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                elif addResult != None and addResult.transferringFiles != []:
                    # set status for transferring
                    for file in self.job.Files:
                        if file.lfn in addResult.transferringFiles:
                            file.status = 'transferring'
                    self.job.jobStatus = 'transferring'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                else:
                    self.job.jobStatus = 'finished'
            # endtime
            if self.job.endTime == 'NULL':
                self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
            # output size and # of outputs
            self.job.nOutputDataFiles = 0
            self.job.outputFileBytes = 0
            for tmpFile in self.job.Files:
                if tmpFile.type == 'output':
                    self.job.nOutputDataFiles += 1
                    try:
                        self.job.outputFileBytes += tmpFile.fsize
                    except:
                        pass
            # protection
            maxOutputFileBytes = 99999999999
            if self.job.outputFileBytes > maxOutputFileBytes:
                self.job.outputFileBytes = maxOutputFileBytes
            # set cancelled state
            if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                self.job.jobStatus = 'cancelled'
            # update job
            if oldJobStatus in ['cancelled', 'closed']:
                pass
            else:
                self.logger.debug("updating DB")
                retU = self.taskBuffer.updateJobs([self.job], False, oldJobStatusList=[oldJobStatus],
                                                  extraInfo=self.extraInfo)
                self.logger.debug("retU: %s" % retU)
                # failed
                if not retU[0]:
                    self.logger.error('failed to update DB')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        type, value, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (type, value))
                        self.logger.debug("cannot unlock XML")
                    return
                # setup for closer
                if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.isCancelled()):
                    destDBList = []
                    guidList = []
                    for file in self.job.Files:
                        # ignore inputs
                        if file.type == 'input':
                            continue
                        # skip pseudo datasets
                        if file.destinationDBlock in ['', None, 'NULL']:
                            continue
                        # start closer for output/log datasets
                        if not file.destinationDBlock in destDBList:
                            destDBList.append(file.destinationDBlock)
                        # collect GUIDs
                        if (self.job.prodSourceLabel == 'panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                            self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                            and file.type == 'output':
                            # extract base LFN since LFN was changed to full LFN for CMS
                            baseLFN = file.lfn.split('/')[-1]
                            guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                             'checksum': file.checksum, 'md5sum': file.md5sum,
                                             'fsize': file.fsize, 'scope': file.scope})
                    if guidList != []:
                        retG = self.taskBuffer.setGUIDs(guidList)
                    if destDBList != []:
                        # start Closer
                        if adderPlugin != None and hasattr(adderPlugin, 'datasetMap') and adderPlugin.datasetMap != {}:
                            cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                                 datasetMap=adderPlugin.datasetMap)
                        else:
                            cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                        self.logger.debug("start Closer")
                        cThr.start()
                        cThr.join()
                        self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(self.job.jediTaskID,
                                                                                            self.job.PandaID,
                                                                                            destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs([assJobID], fromDefined=False,
                                                                  fromArchived=False,
                                                                  fromWaiting=False,
                                                                  forAnal=True)[0]
                                if assJob == None:
                                    self.logger.debug(': associated job PandaID={0} not found in DB'.format(assJobID))
                                else:
                                    cThr = Closer.Closer(self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug("start Closer for PandaID={0}".format(assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug("end Closer for PandaID={0}".format(assJobID))
        self.logger.debug("end")
        try:
            # remove Catalog
            os.remove(self.xmlFile)
        except:
            pass
        # unlock XML
        if self.lockXML != None:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            self.lockXML.close()
    except:
        type, value, traceBack = sys.exc_info()
        self.logger.debug(": %s %s" % (type, value))
        self.logger.debug("except")
        # unlock XML just in case
        try:
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
        except:
            type, value, traceBack = sys.exc_info()
            self.logger.debug(": %s %s" % (type, value))
            self.logger.debug("cannot unlock XML")
def run(self):
    try:
        # make a message instance
        tmpLog = LogWrapper(_logger, None)
        # run main procedure in the same process
        if not self.forkRun:
            tmpLog.debug('main start')
            tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
            # group jobs per VO
            voJobsMap = {}
            ddmFreeJobs = []
            tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
            for tmpJob in self.jobs:
                # set VO=local for DDM free
                if tmpJob.destinationSE == 'local':
                    tmpVO = 'local'
                else:
                    tmpVO = tmpJob.VO
                # make map
                if not voJobsMap.has_key(tmpVO):
                    voJobsMap[tmpVO] = []
                voJobsMap[tmpVO].append(tmpJob)
            # loop over all VOs
            for tmpVO, tmpJobList in voJobsMap.iteritems():
                tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO, len(tmpJobList)))
                # get plugin
                setupperPluginClass = panda_config.getPlugin('setupper_plugins', tmpVO)
                if setupperPluginClass == None:
                    # use ATLAS plug-in by default
                    from SetupperAtlasPlugin import SetupperAtlasPlugin
                    setupperPluginClass = SetupperAtlasPlugin
                tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                try:
                    # make plugin
                    setupperPlugin = setupperPluginClass(self.taskBuffer, self.jobs, tmpLog,
                                                         resubmit=self.resubmit,
                                                         pandaDDM=self.pandaDDM,
                                                         ddmAttempt=self.ddmAttempt,
                                                         onlyTA=self.onlyTA,
                                                         firstSubmission=self.firstSubmission)
                    # run plugin
                    tmpLog.debug('run plugin')
                    setupperPlugin.run()
                    # go forward if not TA
                    if not self.onlyTA:
                        # update jobs
                        tmpLog.debug('update jobs')
                        self.updateJobs(setupperPlugin.jobs + setupperPlugin.jumboJobs, tmpLog)
                        # execute post process
                        tmpLog.debug('post execute plugin')
                        setupperPlugin.postRun()
                    tmpLog.debug('done plugin')
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
            tmpLog.debug('main end')
        else:
            tmpLog.debug('fork start')
            # write jobs to file
            import os
            import cPickle as pickle
            outFileName = '%s/set.%s_%s' % (panda_config.logdir, self.jobs[0].PandaID,
                                            commands.getoutput('uuidgen'))
            outFile = open(outFileName, 'w')
            pickle.dump(self.jobs, outFile)
            outFile.close()
            # run main procedure in another process because python doesn't release memory
            com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,
                                                                panda_config.home_dir_cwd)
            com += 'source %s; ' % panda_config.glite_source
            com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                   (panda_config.pandaCommon_dir, panda_config.pandaPython_dir, panda_config.native_python,
                    panda_config.pandaPython_dir, outFileName)
            if self.onlyTA:
                com += " -t"
            if not self.firstSubmission:
                com += " -f"
            tmpLog.debug(com)
            # execute
            status, output = self.taskBuffer.processLimiter.getstatusoutput(com)
            tmpLog.debug("return from main process: %s %s" % (status, output))
            tmpLog.debug('fork end')
    except:
        errtype, errvalue = sys.exc_info()[:2]
        tmpLog.error('master failed with {0}:{1}'.format(errtype, errvalue))
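# --- illustrative sketch, not part of the original source ---
# A minimal sketch of a VO-specific setupper plugin, inferred only from the call site in
# Setupper.run above. The name SetupperDummyPlugin is hypothetical; the constructor
# keywords and the run()/postRun()/jobs/jumboJobs members are the ones the Setupper
# actually touches. What run() does for a real VO (dataset resolution, brokerage hooks,
# subscriptions) is not shown in this section.

class SetupperDummyPlugin(object):
    def __init__(self, taskBuffer, jobs, logger,
                 resubmit=False, pandaDDM=False, ddmAttempt=0,
                 onlyTA=False, firstSubmission=True):
        self.taskBuffer = taskBuffer
        self.jobs = jobs
        self.jumboJobs = []
        self.logger = logger
        self.resubmit = resubmit
        self.pandaDDM = pandaDDM
        self.ddmAttempt = ddmAttempt
        self.onlyTA = onlyTA
        self.firstSubmission = firstSubmission

    def run(self):
        # resolve input/output data and set job status; in this sketch every job is
        # simply marked as ready so that Setupper.updateJobs can pick it up
        for tmpJob in self.jobs:
            tmpJob.jobStatus = 'assigned'

    def postRun(self):
        # post-processing hook called by Setupper after updateJobs
        self.logger.debug('postRun done for {0} jobs'.format(len(self.jobs)))
# --- end of illustrative sketch ---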
def run(self):
    try:
        _logger.debug('%s Start %s' % (self.pandaID, self.job.jobStatus))
        flagComplete = True
        topUserDsList = []
        usingMerger = False
        disableNotifier = False
        firstIndvDS = True
        finalStatusDS = []
        for destinationDBlock in self.destinationDBlocks:
            dsList = []
            _logger.debug('%s start %s' % (self.pandaID, destinationDBlock))
            # ignore tid datasets
            if re.search('_tid[\d_]+$', destinationDBlock):
                _logger.debug('%s skip %s' % (self.pandaID, destinationDBlock))
                continue
            # ignore HC datasets
            if re.search('^hc_test\.', destinationDBlock) != None or re.search('^user\.gangarbt\.', destinationDBlock) != None:
                if re.search('_sub\d+$', destinationDBlock) == None and re.search('\.lib$', destinationDBlock) == None:
                    _logger.debug('%s skip HC %s' % (self.pandaID, destinationDBlock))
                    continue
            # query dataset
            if self.datasetMap.has_key(destinationDBlock):
                dataset = self.datasetMap[destinationDBlock]
            else:
                dataset = self.taskBuffer.queryDatasetWithMap({'name': destinationDBlock})
            if dataset == None:
                _logger.error('%s Not found : %s' % (self.pandaID, destinationDBlock))
                flagComplete = False
                continue
            # skip tobedeleted/tobeclosed
            if dataset.status in ['cleanup', 'tobeclosed', 'completed', 'deleted']:
                _logger.debug('%s skip %s due to %s' % (self.pandaID, destinationDBlock, dataset.status))
                continue
            dsList.append(dataset)
            # sort
            dsList.sort()
            # count number of completed files
            notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock': destinationDBlock,
                                                           'status': 'unknown'})
            if notFinish < 0:
                _logger.error('%s Invalid DB return : %s' % (self.pandaID, notFinish))
                flagComplete = False
                continue
            # check if completed
            _logger.debug('%s notFinish:%s' % (self.pandaID, notFinish))
            if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user', 'panda']:
                # close non-DQ2 destinationDBlock immediately
                finalStatus = 'closed'
            elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                finalStatus = 'closed'
            elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                 and self.job.processingType != 'usermerge':
                # merge output files
                if firstIndvDS:
                    # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                    finalStatus = 'tobemerged'
                    firstIndvDS = False
                else:
                    finalStatus = 'tobeclosed'
                # set merging to top dataset
                usingMerger = True
                # disable Notifier
                disableNotifier = True
            elif self.job.produceUnMerge():
                finalStatus = 'doing'
            else:
                # set status to 'tobeclosed' to trigger DQ2 closing
                finalStatus = 'tobeclosed'
            if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job):
                allInJobsetFinished = self.checkSubDatasetsInJobset()
            else:
                allInJobsetFinished = True
            if notFinish == 0 and allInJobsetFinished:
                _logger.debug('%s set %s to dataset : %s' % (self.pandaID, finalStatus, destinationDBlock))
                # set status
                dataset.status = finalStatus
                # update dataset in DB
                retT = self.taskBuffer.updateDatasets(dsList, withLock=True,
                                                      withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                      criteriaMap={':crStatus': finalStatus, ':lockStatus': 'locked'})
                if len(retT) > 0 and retT[0] == 1:
                    finalStatusDS += dsList
                    # close user datasets
                    if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                       and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                        # get top-level user dataset
                        topUserDsName = re.sub('_sub\d+$', '', dataset.name)
                        # update if it is the first attempt
                        if topUserDsName != dataset.name and not topUserDsName in topUserDsList and self.job.lockedby != 'jedi':
                            topUserDs = self.taskBuffer.queryDatasetWithMap({'name': topUserDsName})
                            if topUserDs != None:
                                # check status
                                if topUserDs.status in ['completed', 'cleanup', 'tobeclosed', 'deleted',
                                                        'tobemerged', 'merging']:
                                    _logger.debug('%s skip %s due to status=%s' % (self.pandaID, topUserDsName, topUserDs.status))
                                else:
                                    # set status
                                    if self.job.processingType.startswith('gangarobot') or \
                                       self.job.processingType.startswith('hammercloud'):
                                        # not trigger freezing for HC datasets so that files can be appended
                                        topUserDs.status = 'completed'
                                    elif not usingMerger:
                                        topUserDs.status = finalStatus
                                    else:
                                        topUserDs.status = 'merging'
                                    # append to avoid repetition
                                    topUserDsList.append(topUserDsName)
                                    # update DB
                                    retTopT = self.taskBuffer.updateDatasets([topUserDs], withLock=True,
                                                                             withCriteria="status<>:crStatus",
                                                                             criteriaMap={':crStatus': topUserDs.status})
                                    if len(retTopT) > 0 and retTopT[0] == 1:
                                        _logger.debug('%s set %s to top dataset : %s' % (self.pandaID, topUserDs.status, topUserDsName))
                                    else:
                                        _logger.debug('%s failed to update top dataset : %s' % (self.pandaID, topUserDsName))
                        # get parent dataset for merge job
                        if self.job.processingType == 'usermerge':
                            tmpMatch = re.search('--parentDS ([^ \'\"]+)', self.job.jobParameters)
                            if tmpMatch == None:
                                _logger.error('%s failed to extract parentDS' % self.pandaID)
                            else:
                                unmergedDsName = tmpMatch.group(1)
                                # update if it is the first attempt
                                if not unmergedDsName in topUserDsList:
                                    unmergedDs = self.taskBuffer.queryDatasetWithMap({'name': unmergedDsName})
                                    if unmergedDs == None:
                                        _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID, unmergedDsName))
                                    else:
                                        # check status
                                        if unmergedDs.status in ['completed', 'cleanup', 'tobeclosed']:
                                            _logger.debug('%s skip %s due to status=%s' % (self.pandaID, unmergedDsName, unmergedDs.status))
                                        else:
                                            # set status
                                            unmergedDs.status = finalStatus
                                            # append to avoid repetition
                                            topUserDsList.append(unmergedDsName)
                                            # update DB
                                            retTopT = self.taskBuffer.updateDatasets([unmergedDs], withLock=True,
                                                                                     withCriteria="status<>:crStatus",
                                                                                     criteriaMap={':crStatus': unmergedDs.status})
                                            if len(retTopT) > 0 and retTopT[0] == 1:
                                                _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID, unmergedDs.status, unmergedDsName))
                                            else:
                                                _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID, unmergedDsName))
                    # start Activator
                    if re.search('_sub\d+$', dataset.name) == None:
                        if self.job.prodSourceLabel == 'panda' and self.job.processingType in ['merge', 'unmerge']:
                            # don't trigger Activator for merge jobs
                            pass
                        else:
                            if self.job.jobStatus == 'finished':
                                aThr = Activator(self.taskBuffer, dataset)
                                aThr.start()
                                aThr.join()
                else:
                    # unset flag since another thread already updated
                    #flagComplete = False
                    pass
            else:
                # update dataset in DB
                self.taskBuffer.updateDatasets(dsList, withLock=True,
                                               withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                               criteriaMap={':crStatus': finalStatus, ':lockStatus': 'locked'})
                # unset flag
                flagComplete = False
            # end
            _logger.debug('%s end %s' % (self.pandaID, destinationDBlock))
        # special actions for vo
        if flagComplete:
            closerPluginClass = panda_config.getPlugin('closer_plugins', self.job.VO)
            if closerPluginClass == None and self.job.VO == 'atlas':
                # use ATLAS plugin for ATLAS
                from CloserAtlasPlugin import CloserAtlasPlugin
                closerPluginClass = CloserAtlasPlugin
            if closerPluginClass != None:
                closerPlugin = closerPluginClass(self.job, finalStatusDS, _logger)
                closerPlugin.execute()
        # change pending jobs to failed
        finalizedFlag = True
        if flagComplete and self.job.prodSourceLabel == 'user':
            _logger.debug('%s finalize %s %s' % (self.pandaID, self.job.prodUserName, self.job.jobDefinitionID))
            finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName, self.job.jobDefinitionID)
            _logger.debug('%s finalized with %s' % (self.pandaID, finalizedFlag))
        # update unmerged datasets in JEDI to trigger merging
        if flagComplete and self.job.produceUnMerge() and finalStatusDS != []:
            if finalizedFlag:
                tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job, finalStatusDS)
                _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID, tmpStat))
        # start notifier
        _logger.debug('%s source:%s complete:%s' % (self.pandaID, self.job.prodSourceLabel, flagComplete))
        if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel == 'user') or \
           (self.job.jobStatus == 'failed' and self.job.prodSourceLabel == 'panda')) and \
           self.job.lockedby != 'jedi':
            # don't send email for merge jobs
            if (not disableNotifier) and not self.job.processingType in ['merge', 'unmerge']:
                useNotifier = True
                summaryInfo = {}
                # check all jobDefIDs in jobsetID
                if not self.job.jobsetID in [0, None, 'NULL']:
                    useNotifier, summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,
                                                                                             self.job.jobDefinitionID,
                                                                                             self.job.prodUserName)
                    _logger.debug('%s useNotifier:%s' % (self.pandaID, useNotifier))
                if useNotifier:
                    _logger.debug('%s start Notifier' % self.pandaID)
                    nThr = Notifier.Notifier(self.taskBuffer, self.job, self.destinationDBlocks, summaryInfo)
                    nThr.run()
                    _logger.debug('%s end Notifier' % self.pandaID)
        _logger.debug('%s End' % self.pandaID)
    except:
        errType, errValue = sys.exc_info()[:2]
        _logger.error("%s %s" % (errType, errValue))
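# --- illustrative sketch, not part of the original source ---
# A minimal sketch of a closer plugin, inferred only from the call site in Closer.run above:
# the plugin is constructed with the job, the list of datasets that reached their final
# status, and a logger, then execute() is called. CloserDummyPlugin is a hypothetical name;
# CloserAtlasPlugin is the real default used for VO=atlas.

class CloserDummyPlugin(object):
    def __init__(self, job, datasets, logger):
        self.job = job
        self.datasets = datasets
        self.logger = logger

    def execute(self):
        # VO-specific post-closing actions, e.g. freezing the datasets in the
        # VO's data management system; here we only log what would be closed
        for tmpDS in self.datasets:
            self.logger.debug('{0} closed {1}'.format(self.job.PandaID, tmpDS.name))
        return True
# --- end of illustrative sketch ---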
def run(self):
    try:
        self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus, self.attemptNr))
        # lock XML
        self.lockXML = open(self.xmlFile)
        try:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
        except:
            self.logger.debug("cannot get lock : %s" % self.xmlFile)
            self.lockXML.close()
            # remove XML just in case for the final attempt
            if not self.ignoreTmpError:
                try:
                    # remove Catalog
                    os.remove(self.xmlFile)
                except:
                    pass
            return
        # check if file exists
        if not os.path.exists(self.xmlFile):
            self.logger.debug("not exist : %s" % self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
            except:
                pass
            return
        # query job
        self.job = self.taskBuffer.peekJobs([self.jobID], fromDefined=False,
                                            fromArchived=False,
                                            fromWaiting=False,
                                            forAnal=True)[0]
        # check if job has finished
        if self.job == None:
            self.logger.debug(': job not found in DB')
        elif self.job.jobStatus in ['finished', 'failed', 'unknown', 'cancelled', 'merging']:
            self.logger.error(': invalid state -> %s' % self.job.jobStatus)
        elif self.attemptNr != None and self.job.attemptNr != self.attemptNr:
            self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr, self.attemptNr))
        else:
            # check file status in JEDI
            fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
            self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))
            if fileCheckInJEDI == None:
                raise RuntimeError, 'failed to check file status in JEDI'
            if fileCheckInJEDI == False:
                # set job status to failed since some file status is wrong in JEDI
                self.jobStatus = 'failed'
                self.job.ddmErrorCode = ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "wrong file status in JEDI"
                self.logger.debug("set jobStatus={0} since input are already cancelled in JEDI".format(self.jobStatus))
            # keep old status
            oldJobStatus = self.job.jobStatus
            # set job status
            if not self.job.jobStatus in ['transferring']:
                self.job.jobStatus = self.jobStatus
            addResult = None
            adderPlugin = None
            # parse XML
            parseResult = self.parseXML()
            if parseResult < 2:
                # interaction with DDM
                try:
                    # set VO=local for DDM free
                    if self.job.destinationSE == 'local':
                        tmpVO = 'local'
                    else:
                        tmpVO = self.job.VO
                    # instantiate concrete plugin
                    adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
                    if adderPluginClass == None:
                        # use ATLAS plugin by default
                        from AdderAtlasPlugin import AdderAtlasPlugin
                        adderPluginClass = AdderAtlasPlugin
                    self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                    adderPlugin = adderPluginClass(self.job,
                                                   taskBuffer=self.taskBuffer,
                                                   siteMapper=self.siteMapper,
                                                   extraInfo=self.extraInfo,
                                                   logger=self.logger)
                    # execute
                    self.logger.debug('plugin is ready')
                    adderPlugin.execute()
                    addResult = adderPlugin.result
                    self.logger.debug('plugin done with %s' % (addResult.statusCode))
                except:
                    errtype, errvalue = sys.exc_info()[:2]
                    self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO, errtype, errvalue))
                    addResult = None
                    self.job.ddmErrorCode = ErrorCode.EC_Adder
                    self.job.ddmErrorDiag = "AdderPlugin failure"
                # ignore temporary errors
                if self.ignoreTmpError and addResult != None and addResult.isTemporary():
                    self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                    self.logger.debug('escape')
                    # unlock XML
                    try:
                        fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                        self.lockXML.close()
                    except:
                        type, value, traceBack = sys.exc_info()
                        self.logger.debug(": %s %s" % (type, value))
                        self.logger.debug("cannot unlock XML")
                    return
                # failed
                if addResult == None or not addResult.isSucceeded():
                    self.job.jobStatus = 'failed'
            # set file status for failed jobs or failed transferring jobs
            if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                self.job.jobStatus = 'failed'
                for file in self.job.Files:
                    if file.type in ['output', 'log']:
                        if addResult != None and file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                        else:
                            file.status = 'failed'
            else:
                # reset errors
                self.job.jobDispatcherErrorCode = 0
                self.job.jobDispatcherErrorDiag = 'NULL'
                # set status
                if addResult != None and addResult.mergingFiles != []:
                    # set status for merging
                    for file in self.job.Files:
                        if file.lfn in addResult.mergingFiles:
                            file.status = 'merging'
                    self.job.jobStatus = 'merging'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                elif addResult != None and addResult.transferringFiles != []:
                    # set status for transferring
                    for file in self.job.Files:
                        if file.lfn in addResult.transferringFiles:
                            file.status = 'transferring'
                    self.job.jobStatus = 'transferring'
                    # propagate transition to prodDB
                    self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
                else:
                    self.job.jobStatus = 'finished'
            # endtime
            if self.job.endTime == 'NULL':
                self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())
            # output size and # of outputs
            self.job.nOutputDataFiles = 0
            self.job.outputFileBytes = 0
            for tmpFile in self.job.Files:
                if tmpFile.type == 'output':
                    self.job.nOutputDataFiles += 1
                    try:
                        self.job.outputFileBytes += tmpFile.fsize
                    except:
                        pass
            # protection
            maxOutputFileBytes = 99999999999
            if self.job.outputFileBytes > maxOutputFileBytes:
                self.job.outputFileBytes = maxOutputFileBytes
            # set cancelled state
            if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                self.job.jobStatus = 'cancelled'
            # update job
            self.logger.debug("updating DB")
            retU = self.taskBuffer.updateJobs([self.job], False, oldJobStatusList=[oldJobStatus],
                                              extraInfo=self.extraInfo)
            self.logger.debug("retU: %s" % retU)
            # failed
            if not retU[0]:
                self.logger.error('failed to update DB')
                # unlock XML
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except:
                    type, value, traceBack = sys.exc_info()
                    self.logger.debug(": %s %s" % (type, value))
                    self.logger.debug("cannot unlock XML")
                return
            # setup for closer
            if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                destDBList = []
                guidList = []
                for file in self.job.Files:
                    # ignore inputs
                    if file.type == 'input':
                        continue
                    # skip pseudo datasets
                    if file.destinationDBlock in ['', None, 'NULL']:
                        continue
                    # start closer for output/log datasets
                    if not file.destinationDBlock in destDBList:
                        destDBList.append(file.destinationDBlock)
                    # collect GUIDs
                    if (self.job.prodSourceLabel == 'panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                        self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                        and file.type == 'output':
                        # extract base LFN since LFN was changed to full LFN for CMS
                        baseLFN = file.lfn.split('/')[-1]
                        guidList.append({'lfn': baseLFN, 'guid': file.GUID, 'type': file.type,
                                         'checksum': file.checksum, 'md5sum': file.md5sum,
                                         'fsize': file.fsize, 'scope': file.scope})
                if guidList != []:
                    retG = self.taskBuffer.setGUIDs(guidList)
                if destDBList != []:
                    # start Closer
                    if adderPlugin != None and hasattr(adderPlugin, 'datasetMap') and adderPlugin.datasetMap != {}:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job,
                                             datasetMap=adderPlugin.datasetMap)
                    else:
                        cThr = Closer.Closer(self.taskBuffer, destDBList, self.job)
                    self.logger.debug("start Closer")
                    cThr.start()
                    cThr.join()
                    self.logger.debug("end Closer")
        self.logger.debug("end")
        try:
            # remove Catalog
            os.remove(self.xmlFile)
        except:
            pass
        # unlock XML
        if self.lockXML != None:
            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            self.lockXML.close()
    except:
        type, value, traceBack = sys.exc_info()
        self.logger.debug(": %s %s" % (type, value))
        self.logger.debug("except")
        # unlock XML just in case
        try:
            if self.lockXML != None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
        except:
            type, value, traceBack = sys.exc_info()
            self.logger.debug(": %s %s" % (type, value))
            self.logger.debug("cannot unlock XML")