Example #1
 def getPluginClass(self, tmpVO):
     # instantiate concrete plugin
     adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO)
     if adderPluginClass is None:
         # use ATLAS plugin by default
         from AdderAtlasPlugin import AdderAtlasPlugin
         adderPluginClass = AdderAtlasPlugin
     self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
     return adderPluginClass
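
Note: the fallback above relies on panda_config.getPlugin returning None when no plugin is registered for the given group and VO. A minimal sketch of such a factory, assuming a plain {(group, vo): class} registry; the registry layout and the AdderCmsPlugin entry are hypothetical, not the real panda_config internals:

 # sketch only: the real panda_config.getPlugin may resolve plugin classes
 # from configuration rather than an in-memory registry like this one
 class AdderCmsPlugin(object):
     pass

 _PLUGIN_REGISTRY = {
     ('adder_plugins', 'cms'): AdderCmsPlugin,  # illustrative entry
 }

 def getPlugin(group, vo):
     # return the class registered for this group/VO, or None so that the
     # caller can fall back to AdderAtlasPlugin
     return _PLUGIN_REGISTRY.get((group, vo))

 # getPlugin('adder_plugins', 'cms')     -> AdderCmsPlugin
 # getPlugin('adder_plugins', 'unknown') -> None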
Example #2
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger,None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free 
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 if tmpVO not in voJobsMap:
                     voJobsMap[tmpVO] = []
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO,tmpJobList in voJobsMap.iteritems():
                 tmpLog.debug('vo={0} has {1} jobs'.format(tmpVO,len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin('setupper_plugins',tmpVO)
                 if setupperPluginClass is None:
                     # use ATLAS plug-in by default
                     from SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(setupperPluginClass.__name__))
                 try:
                     # make plugin
                     # use the jobs grouped for this VO; self.jobs would ignore the grouping
                     setupperPlugin = setupperPluginClass(self.taskBuffer,tmpJobList,tmpLog,
                                                          resubmit=self.resubmit,
                                                          pandaDDM=self.pandaDDM,
                                                          ddmAttempt=self.ddmAttempt,
                                                          onlyTA=self.onlyTA,
                                                          firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(setupperPlugin.jobs+setupperPlugin.jumboJobs,tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except Exception:
                     errtype,errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             import cPickle as pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,self.jobs[0].PandaID,commands.getoutput('uuidgen'))
             outFile = open(outFileName,'wb')
             pickle.dump(self.jobs,outFile)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com =  'cd %s > /dev/null 2>&1; export HOME=%s; ' % (panda_config.home_dir_cwd,panda_config.home_dir_cwd)
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status,output = self.taskBuffer.processLimiter.getstatusoutput(com)
             tmpLog.debug("return from main process: %s %s" % (status,output))                
             tmpLog.debug('fork end')
     except Exception:
         errtype,errvalue = sys.exc_info()[:2]
         tmpLog.error('master failed with {0}:{1}'.format(errtype,errvalue))
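
Note: the fork branch above pickles the job list to a file and launches forkSetupper.py -i <file> in a child process (with -t for onlyTA and -f when it is not the first submission), so the memory allocated during setup dies with the child instead of accumulating in the long-lived parent. A rough sketch of what the child-side script has to do, assuming those option meanings; the real forkSetupper.py may differ:

 import optparse
 import pickle  # the parent wrote with cPickle; pickle can read it back

 parser = optparse.OptionParser()
 parser.add_option('-i', dest='inFile', help='file containing the pickled job list')
 parser.add_option('-t', dest='onlyTA', action='store_true', default=False)
 parser.add_option('-f', dest='notFirstSubmission', action='store_true', default=False)
 options, args = parser.parse_args()

 inFile = open(options.inFile, 'rb')
 jobs = pickle.load(inFile)
 inFile.close()
 # ... hand 'jobs' to Setupper with forkRun disabled and exit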
Example #3
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))
            # lock XML
            self.lockXML = open(self.xmlFile)
            try:
                fcntl.flock(self.lockXML.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except IOError:
                self.logger.debug("cannot get lock : %s" % self.xmlFile)
                self.lockXML.close()
                # remove XML just in case for the final attempt
                if not self.ignoreTmpError:
                    try:
                        # remove Catalog
                        os.remove(self.xmlFile)
                    except OSError:
                        pass
                return
            # check if file exists
            if not os.path.exists(self.xmlFile):
                self.logger.debug("not exist : %s" % self.xmlFile)
                try:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                    self.lockXML.close()
                except Exception:
                    pass
                return
            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job is None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and \
                        self.job.taskBufferErrorCode not in [taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI is None:
                        raise RuntimeError('failed to check file status in JEDI')
                    if fileCheckInJEDI is False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "wrong file status in source database"
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] is True:
                            self.logger.debug("end")
                            try:
                                # remove Catalog
                                os.remove(self.xmlFile)
                            except OSError:
                                pass
                            # unlock XML
                            if self.lockXML is not None:
                                fcntl.flock(self.lockXML.fileno(),
                                            fcntl.LOCK_UN)
                                self.lockXML.close()
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC is None:
                            raise RuntimeError('failed to check the cloned job')
                        # failed to lock semaphore
                        if checkJC['lock'] is False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if self.job.jobStatus not in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # set VO=local for DDM free
                        if self.job.destinationSE == 'local':
                            tmpVO = 'local'
                        else:
                            tmpVO = self.job.VO
                        # instantiate concrete plugin
                        adderPluginClass = panda_config.getPlugin(
                            'adder_plugins', tmpVO)
                        if adderPluginClass is None:
                            # use ATLAS plugin by default
                            from AdderAtlasPlugin import AdderAtlasPlugin
                            adderPluginClass = AdderAtlasPlugin
                        self.logger.debug('plugin name {0}'.format(
                            adderPluginClass.__name__))
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(tmpVO, errtype, errvalue))
                        addResult = None
                        self.job.ddmErrorCode = ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult is not None and addResult.isTemporary():
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except Exception:
                            errtype, errvalue, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (errtype, errvalue))
                            self.logger.debug("cannot unlock XML")
                        return
                    # failed
                    if addResult is None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                    elif self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                    elif self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag

                    _logger.debug(
                        "updatejob has source %s, error_code %s and error_diag %s"
                        % (source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, source,
                                error_code, error_diag, self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.debug(
                                "apply_retrial_rules excepted and needs to be investigated (%s)"
                                % (e))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult is not None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult is not None and addResult.mergingFiles:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult is not None and addResult.transferringFiles:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except Exception:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error('failed to update DB')
                        # unlock XML
                        try:
                            fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                            self.lockXML.close()
                        except Exception:
                            errtype, errvalue, traceBack = sys.exc_info()
                            self.logger.debug(": %s %s" % (errtype, errvalue))
                            self.logger.debug("cannot unlock XML")
                        return
                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if file.destinationDBlock not in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList:
                            # start Closer
                            if adderPlugin is not None and hasattr(adderPlugin, 'datasetMap') \
                                    and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            cThr.start()
                            cThr.join()
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID, assDBlocks in assDBlockMap.iteritems():
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                if assJob is None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    cThr.start()
                                    cThr.join()
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            try:
                # remove Catalog
                os.remove(self.xmlFile)
            except OSError:
                pass
            # unlock XML
            if self.lockXML is not None:
                fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                self.lockXML.close()
        except Exception:
            errtype, errvalue, traceBack = sys.exc_info()
            self.logger.debug(": %s %s" % (errtype, errvalue))
            self.logger.debug("except")
            # unlock XML just in case
            try:
                if self.lockXML is not None:
                    fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
            except Exception:
                errtype, errvalue, traceBack = sys.exc_info()
                self.logger.debug(": %s %s" % (errtype, errvalue))
                self.logger.debug("cannot unlock XML")
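
Note: several workers can pick up the same XML catalog, so run() above serializes them with an exclusive, non-blocking flock on the file and bails out immediately when another worker holds it. A minimal standalone sketch of that pattern; the function and file names are illustrative:

 import fcntl

 def tryExclusiveLock(path):
     # open the file and try to take an exclusive lock without blocking
     lockFile = open(path)
     try:
         fcntl.flock(lockFile.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
     except IOError:
         # another worker already holds the lock; give up instead of waiting
         lockFile.close()
         return None
     # caller must flock(..., LOCK_UN) and close() when done
     return lockFile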
Example #4
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger, None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(
                 self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 if tmpVO not in voJobsMap:
                     voJobsMap[tmpVO] = []
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO, tmpJobList in voJobsMap.iteritems():
                 tmpLog.debug('vo={0} has {1} jobs'.format(
                     tmpVO, len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin(
                     'setupper_plugins', tmpVO)
                 if setupperPluginClass is None:
                     # use ATLAS plug-in by default
                     from SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(
                     setupperPluginClass.__name__))
                 try:
                     # make plugin
                     setupperPlugin = setupperPluginClass(
                         self.taskBuffer,
                         tmpJobList,
                         tmpLog,
                         resubmit=self.resubmit,
                         pandaDDM=self.pandaDDM,
                         ddmAttempt=self.ddmAttempt,
                         onlyTA=self.onlyTA,
                         firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(
                             setupperPlugin.jobs + setupperPlugin.jumboJobs,
                             tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(
                         errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             import cPickle as pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,
                                             self.jobs[0].PandaID,
                                             commands.getoutput('uuidgen'))
             outFile = open(outFileName, 'wb')
             pickle.dump(self.jobs, outFile)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                 panda_config.home_dir_cwd, panda_config.home_dir_cwd)
             com += 'source %s; ' % panda_config.glite_source
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status, output = self.taskBuffer.processLimiter.getstatusoutput(
                 com)
             tmpLog.debug("return from main process: %s %s" %
                          (status, output))
             tmpLog.debug('fork end')
     except Exception:
         errtype, errvalue = sys.exc_info()[:2]
         tmpLog.error('master failed with {0}:{1}'.format(
             errtype, errvalue))
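
Note: the command above is not run directly but through self.taskBuffer.processLimiter.getstatusoutput, which suggests a cap on how many forked children run at once. A hypothetical sketch of such a limiter, assuming a simple semaphore; commands is the Python 2 module already used above (subprocess replaces it in Python 3), and the real processLimiter may differ:

 import commands
 import threading

 class ProcessLimiter(object):
     def __init__(self, maxProcess=3):
         # at most maxProcess commands may run concurrently
         self.semaphore = threading.Semaphore(maxProcess)

     def getstatusoutput(self, com):
         # block until a slot is free, then run the command in a subshell
         self.semaphore.acquire()
         try:
             return commands.getstatusoutput(com)
         finally:
             self.semaphore.release()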
Example #5
 def run(self):
     try:
         _logger.debug('%s Start %s' % (self.pandaID,self.job.jobStatus))
         flagComplete    = True
         topUserDsList   = []
         usingMerger     = False        
         disableNotifier = False
         firstIndvDS     = True
         finalStatusDS   = []
         for destinationDBlock in self.destinationDBlocks:
             dsList = []
             _logger.debug('%s start %s' % (self.pandaID,destinationDBlock))
             # ignore tid datasets
             if re.search(r'_tid[\d_]+$',destinationDBlock):
                 _logger.debug('%s skip %s' % (self.pandaID,destinationDBlock))                
                 continue
             # ignore HC datasets
             if re.search(r'^hc_test\.',destinationDBlock) is not None or re.search(r'^user\.gangarbt\.',destinationDBlock) is not None:
                 if re.search(r'_sub\d+$',destinationDBlock) is None and re.search(r'\.lib$',destinationDBlock) is None:
                     _logger.debug('%s skip HC %s' % (self.pandaID,destinationDBlock))                
                     continue
             # query dataset
             if destinationDBlock in self.datasetMap:
                 dataset = self.datasetMap[destinationDBlock]
             else:
                 dataset = self.taskBuffer.queryDatasetWithMap({'name':destinationDBlock})
             if dataset is None:
                 _logger.error('%s Not found : %s' % (self.pandaID,destinationDBlock))
                 flagComplete = False
                 continue
             # skip tobedeleted/tobeclosed 
             if dataset.status in ['cleanup','tobeclosed','completed','deleted']:
                 _logger.debug('%s skip %s due to %s' % (self.pandaID,destinationDBlock,dataset.status))
                 continue
             dsList.append(dataset)
             # sort
             dsList.sort()
             # count number of completed files
             notFinish = self.taskBuffer.countFilesWithMap({'destinationDBlock':destinationDBlock,
                                                            'status':'unknown'})
             if notFinish < 0:
                 _logger.error('%s Invalid DB return : %s' % (self.pandaID,notFinish))
                 flagComplete = False                
                 continue
             # check if completed
             _logger.debug('%s notFinish:%s' % (self.pandaID,notFinish))
             if self.job.destinationSE == 'local' and self.job.prodSourceLabel in ['user','panda']:
                 # close non-DQ2 destinationDBlock immediately
                 finalStatus = 'closed'
             elif self.job.lockedby == 'jedi' and self.isTopLevelDS(destinationDBlock):
                 # set it closed in order not to trigger DDM cleanup. It will be closed by JEDI
                 finalStatus = 'closed'
             elif self.job.prodSourceLabel in ['user'] and "--mergeOutput" in self.job.jobParameters \
                      and self.job.processingType != 'usermerge':
                 # merge output files
                 if firstIndvDS:
                     # set 'tobemerged' to only the first dataset to avoid triggering many Mergers for --individualOutDS
                     finalStatus = 'tobemerged'
                     firstIndvDS = False
                 else:
                     finalStatus = 'tobeclosed'
                 # set merging to top dataset
                 usingMerger = True
                 # disable Notifier
                 disableNotifier = True
             elif self.job.produceUnMerge():
                 finalStatus = 'doing'
             else:
                 # set status to 'tobeclosed' to trigger DQ2 closing
                 finalStatus = 'tobeclosed'
             if notFinish == 0 and EventServiceUtils.isEventServiceMerge(self.job):
                 allInJobsetFinished = self.checkSubDatasetsInJobset()
             else:
                 allInJobsetFinished = True
             if notFinish == 0 and allInJobsetFinished: 
                 _logger.debug('%s set %s to dataset : %s' % (self.pandaID,finalStatus,destinationDBlock))
                 # set status
                 dataset.status = finalStatus
                 # update dataset in DB
                 retT = self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                       criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 if len(retT) > 0 and retT[0]==1:
                     finalStatusDS += dsList
                     # close user datasets
                     if self.job.prodSourceLabel in ['user'] and self.job.destinationDBlock.endswith('/') \
                            and (dataset.name.startswith('user') or dataset.name.startswith('group')):
                         # get top-level user dataset 
                         topUserDsName = re.sub(r'_sub\d+$','',dataset.name)
                         # update if it is the first attempt
                         if topUserDsName != dataset.name and topUserDsName not in topUserDsList and self.job.lockedby != 'jedi':
                             topUserDs = self.taskBuffer.queryDatasetWithMap({'name':topUserDsName})
                             if topUserDs is not None:
                                 # check status
                                 if topUserDs.status in ['completed','cleanup','tobeclosed','deleted',
                                                         'tobemerged','merging']:
                                     _logger.debug('%s skip %s due to status=%s' % (self.pandaID,topUserDsName,topUserDs.status))
                                 else:
                                     # set status
                                     if self.job.processingType.startswith('gangarobot') or \
                                            self.job.processingType.startswith('hammercloud'):
                                         # not trigger freezing for HC datasets so that files can be appended
                                         topUserDs.status = 'completed'
                                     elif not usingMerger:
                                         topUserDs.status = finalStatus
                                     else:
                                         topUserDs.status = 'merging'
                                     # append to avoid repetition
                                     topUserDsList.append(topUserDsName)
                                     # update DB
                                     retTopT = self.taskBuffer.updateDatasets([topUserDs],withLock=True,withCriteria="status<>:crStatus",
                                                                              criteriaMap={':crStatus':topUserDs.status})
                                     if len(retTopT) > 0 and retTopT[0]==1:
                                         _logger.debug('%s set %s to top dataset : %s' % (self.pandaID,topUserDs.status,topUserDsName))
                                     else:
                                         _logger.debug('%s failed to update top dataset : %s' % (self.pandaID,topUserDsName))
                         # get parent dataset for merge job
                         if self.job.processingType == 'usermerge':
                             tmpMatch = re.search('--parentDS ([^ \'\"]+)',self.job.jobParameters)
                             if tmpMatch is None:
                                 _logger.error('%s failed to extract parentDS' % self.pandaID)
                             else:
                                 unmergedDsName = tmpMatch.group(1)
                                 # update if it is the first attempt
                                 if unmergedDsName not in topUserDsList:
                                     unmergedDs = self.taskBuffer.queryDatasetWithMap({'name':unmergedDsName})
                                     if unmergedDs is None:
                                         _logger.error('%s failed to get parentDS=%s from DB' % (self.pandaID,unmergedDsName))
                                     else:
                                         # check status
                                         if unmergedDs.status in ['completed','cleanup','tobeclosed']:
                                             _logger.debug('%s skip %s due to status=%s' % (self.pandaID,unmergedDsName,unmergedDs.status))
                                         else:
                                             # set status
                                             unmergedDs.status = finalStatus
                                             # append to avoid repetition
                                             topUserDsList.append(unmergedDsName)
                                             # update DB
                                             retTopT = self.taskBuffer.updateDatasets([unmergedDs],withLock=True,withCriteria="status<>:crStatus",
                                                                                      criteriaMap={':crStatus':unmergedDs.status})
                                             if len(retTopT) > 0 and retTopT[0]==1:
                                                 _logger.debug('%s set %s to parent dataset : %s' % (self.pandaID,unmergedDs.status,unmergedDsName))
                                             else:
                                                 _logger.debug('%s failed to update parent dataset : %s' % (self.pandaID,unmergedDsName))
                     # start Activator
                     if re.search(r'_sub\d+$',dataset.name) is None:
                         if self.job.prodSourceLabel=='panda' and self.job.processingType in ['merge','unmerge']:
                             # don't trigger Activator for merge jobs
                             pass
                         else:
                             if self.job.jobStatus == 'finished':
                                 aThr = Activator(self.taskBuffer,dataset)
                                 aThr.start()
                                 aThr.join()
                 else:
                     # unset flag since another thread already updated 
                     #flagComplete = False
                     pass
             else:
                 # update dataset in DB
                 self.taskBuffer.updateDatasets(dsList,withLock=True,withCriteria="status<>:crStatus AND status<>:lockStatus ",
                                                criteriaMap={':crStatus':finalStatus,':lockStatus':'locked'})
                 # unset flag
                 flagComplete = False
             # end
             _logger.debug('%s end %s' % (self.pandaID,destinationDBlock))
         # special actions for vo
         if flagComplete:
             closerPluginClass = panda_config.getPlugin('closer_plugins',self.job.VO)
             if closerPluginClass is None and self.job.VO == 'atlas':
                 # use ATLAS plugin for ATLAS
                 from CloserAtlasPlugin import CloserAtlasPlugin
                 closerPluginClass = CloserAtlasPlugin
             if closerPluginClass is not None:
                 closerPlugin = closerPluginClass(self.job,finalStatusDS,_logger)
                 closerPlugin.execute()
         # change pending jobs to failed
         finalizedFlag = True
         if flagComplete and self.job.prodSourceLabel=='user':
             _logger.debug('%s finalize %s %s' % (self.pandaID,self.job.prodUserName,self.job.jobDefinitionID))
             finalizedFlag = self.taskBuffer.finalizePendingJobs(self.job.prodUserName,self.job.jobDefinitionID)
             _logger.debug('%s finalized with %s' % (self.pandaID,finalizedFlag))
         # update unmerged datasets in JEDI to trigger merging
         if flagComplete and self.job.produceUnMerge() and finalStatusDS:
             if finalizedFlag:
                 tmpStat = self.taskBuffer.updateUnmergedDatasets(self.job,finalStatusDS)
                 _logger.debug('%s updated unmerged datasets with %s' % (self.pandaID,tmpStat))
         # start notifier
         _logger.debug('%s source:%s complete:%s' % (self.pandaID,self.job.prodSourceLabel,flagComplete))
         if (self.job.jobStatus != 'transferring') and ((flagComplete and self.job.prodSourceLabel=='user') or \
            (self.job.jobStatus=='failed' and self.job.prodSourceLabel=='panda')) and \
            self.job.lockedby != 'jedi':
             # don't send email for merge jobs
             if (not disableNotifier) and self.job.processingType not in ['merge','unmerge']:
                 useNotifier = True
                 summaryInfo = {}
                 # check all jobDefIDs in jobsetID
                 if self.job.jobsetID not in [0,None,'NULL']:
                     useNotifier,summaryInfo = self.taskBuffer.checkDatasetStatusForNotifier(self.job.jobsetID,self.job.jobDefinitionID,
                                                                                             self.job.prodUserName)
                     _logger.debug('%s useNotifier:%s' % (self.pandaID,useNotifier))
                 if useNotifier:
                     _logger.debug('%s start Notifier' % self.pandaID)
                     nThr = Notifier.Notifier(self.taskBuffer,self.job,self.destinationDBlocks,summaryInfo)
                     nThr.run()
                     _logger.debug('%s end Notifier' % self.pandaID)                    
         _logger.debug('%s End' % self.pandaID)
     except Exception:
         errType,errValue = sys.exc_info()[:2]
         _logger.error("%s %s" % (errType,errValue))
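
Note: every status transition above goes through updateDatasets with a withCriteria clause ("status<>:crStatus AND status<>:lockStatus"), so when several Closer threads race on the same dataset, only the one whose UPDATE actually changes a row proceeds to the follow-up actions (Activator, notification). A rough sketch of that compare-and-set idea in plain SQL via sqlite3; the table layout is illustrative, not the PanDA schema:

 import sqlite3

 def setFinalStatus(conn, name, finalStatus):
     # update only if the dataset is not already in finalStatus and not locked;
     # rowcount plays the role of retT[0] above: 1 means this caller won the race
     cur = conn.execute("UPDATE datasets SET status=? "
                        "WHERE name=? AND status<>? AND status<>'locked'",
                        (finalStatus, name, finalStatus))
     conn.commit()
     return cur.rowcount

 # usage sketch:
 #   conn = sqlite3.connect(':memory:')
 #   conn.execute("CREATE TABLE datasets (name TEXT, status TEXT)")
 #   conn.execute("INSERT INTO datasets VALUES ('user.x_sub1','unknown')")
 #   setFinalStatus(conn, 'user.x_sub1', 'tobeclosed')  # -> 1
 #   setFinalStatus(conn, 'user.x_sub1', 'tobeclosed')  # -> 0, already done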
Example #6
 def run(self):
     try:
         self.logger.debug("new start: %s attemptNr=%s" % (self.jobStatus,self.attemptNr))
         # lock XML
         self.lockXML = open(self.xmlFile)
         try:
             fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_EX|fcntl.LOCK_NB)
         except IOError:
             self.logger.debug("cannot get lock : %s" % self.xmlFile)
             self.lockXML.close()
             # remove XML just in case for the final attempt
             if not self.ignoreTmpError:
                 try:
                     # remove Catalog
                     os.remove(self.xmlFile)
                 except OSError:
                     pass
             return
         # check if file exists
         if not os.path.exists(self.xmlFile):
             self.logger.debug("not exist : %s" % self.xmlFile)
             try:
                 fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                 self.lockXML.close()
             except Exception:
                 pass
             return
         # query job
         self.job = self.taskBuffer.peekJobs([self.jobID],fromDefined=False,
                                             fromArchived=False,
                                             fromWaiting=False,
                                             forAnal=True)[0]
         # check if job has finished
         if self.job is None:
             self.logger.debug(': job not found in DB')
         elif self.job.jobStatus in ['finished','failed','unknown','cancelled','merging']:
             self.logger.error(': invalid state -> %s' % self.job.jobStatus)
         elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
             self.logger.error('wrong attemptNr -> job=%s <> %s' % (self.job.attemptNr,self.attemptNr))
         else:
             # check file status in JEDI
             fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(self.job)
             self.logger.debug("check file status in JEDI : {0}".format(fileCheckInJEDI))                
             if fileCheckInJEDI is None:
                 raise RuntimeError('failed to check file status in JEDI')
             if fileCheckInJEDI is False:
                 # set job status to failed since some file status is wrong in JEDI
                 self.jobStatus = 'failed'
                 self.job.ddmErrorCode = ErrorCode.EC_Adder
                 self.job.ddmErrorDiag = "wrong file status in JEDI"
                 self.logger.debug("set jobStatus={0} since inputs are already cancelled in JEDI".format(self.jobStatus))
             # keep old status
             oldJobStatus = self.job.jobStatus
             # set job status
             if self.job.jobStatus not in ['transferring']:
                 self.job.jobStatus = self.jobStatus
             addResult = None
             adderPlugin = None
             # parse XML
             parseResult = self.parseXML()
             if parseResult < 2:
                 # interaction with DDM
                 try:
                     # set VO=local for DDM free
                     if self.job.destinationSE == 'local':
                         tmpVO = 'local'
                     else:
                         tmpVO = self.job.VO
                     # instantiate concrete plugin
                     adderPluginClass = panda_config.getPlugin('adder_plugins',tmpVO)
                     if adderPluginClass is None:
                         # use ATLAS plugin by default
                         from AdderAtlasPlugin import AdderAtlasPlugin
                         adderPluginClass = AdderAtlasPlugin
                     self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
                     adderPlugin = adderPluginClass(self.job,
                                                    taskBuffer=self.taskBuffer,
                                                    siteMapper=self.siteMapper,
                                                    extraInfo=self.extraInfo,
                                                    logger=self.logger)
                     # execute
                     self.logger.debug('plugin is ready')
                     adderPlugin.execute()
                     addResult = adderPlugin.result
                     self.logger.debug('plugin done with %s' % (addResult.statusCode))
                 except Exception:
                     errtype,errvalue = sys.exc_info()[:2]
                     self.logger.error("failed to execute AdderPlugin for VO={0} with {1}:{2}".format(tmpVO,
                                                                                                      errtype,
                                                                                                      errvalue)) 
                     addResult = None
                     self.job.ddmErrorCode = ErrorCode.EC_Adder
                     self.job.ddmErrorDiag = "AdderPlugin failure"
                 # ignore temporary errors
                 if self.ignoreTmpError and addResult is not None and addResult.isTemporary():
                     self.logger.debug(': ignore %s ' % self.job.ddmErrorDiag)
                     self.logger.debug('escape')
                     # unlock XML
                     try:
                         fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                         self.lockXML.close()
                     except Exception:
                         errtype, errvalue, traceBack = sys.exc_info()
                         self.logger.debug(": %s %s" % (errtype, errvalue))
                         self.logger.debug("cannot unlock XML")
                     return
                 # failed
                 if addResult is None or not addResult.isSucceeded():
                     self.job.jobStatus = 'failed'
             # set file status for failed jobs or failed transferring jobs
             if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                 self.job.jobStatus = 'failed'
                 for file in self.job.Files:
                     if file.type in ['output','log']:
                         if addResult is not None and file.lfn in addResult.mergingFiles:
                             file.status = 'merging'
                         else:
                             file.status = 'failed'
             else:
                 # reset errors
                 self.job.jobDispatcherErrorCode = 0
                 self.job.jobDispatcherErrorDiag = 'NULL'
                 # set status
                 if addResult is not None and addResult.mergingFiles:
                     # set status for merging:                        
                     for file in self.job.Files:
                         if file.lfn in addResult.mergingFiles:
                             file.status = 'merging'
                     self.job.jobStatus = 'merging'
                     # propagate transition to prodDB
                     self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                 elif addResult is not None and addResult.transferringFiles:
                     # set status for transferring
                     for file in self.job.Files:
                         if file.lfn in addResult.transferringFiles:
                             file.status = 'transferring'
                     self.job.jobStatus = 'transferring'
                     # propagate transition to prodDB
                     self.job.stateChangeTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
                 else:
                     self.job.jobStatus = 'finished'
             # endtime
             if self.job.endTime == 'NULL':
                 self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',time.gmtime())
             # output size and # of outputs
             self.job.nOutputDataFiles = 0
             self.job.outputFileBytes = 0
             for tmpFile in self.job.Files:
                 if tmpFile.type == 'output':
                     self.job.nOutputDataFiles += 1
                     try:
                         self.job.outputFileBytes += tmpFile.fsize
                     except Exception:
                         pass
             # protection
             maxOutputFileBytes = 99999999999
             if self.job.outputFileBytes > maxOutputFileBytes:
                 self.job.outputFileBytes = maxOutputFileBytes
             # set cancelled state
             if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                 self.job.jobStatus = 'cancelled'
             # update job
             self.logger.debug("updating DB")
             retU = self.taskBuffer.updateJobs([self.job],False,oldJobStatusList=[oldJobStatus],
                                               extraInfo=self.extraInfo)
             self.logger.debug("retU: %s" % retU)
             # failed
             if not retU[0]:
                 self.logger.error('failed to update DB')
                 # unlock XML
                 try:
                     fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
                     self.lockXML.close()                            
                 except Exception:
                     errtype, errvalue, traceBack = sys.exc_info()
                     self.logger.debug(": %s %s" % (errtype, errvalue))
                     self.logger.debug("cannot unlock XML")
                 return
             # setup for closer
             if not (EventServiceUtils.isEventServiceJob(self.job) and self.job.jobStatus == 'cancelled'):
                 destDBList = []
                 guidList = []
                 for file in self.job.Files:
                     # ignore inputs
                     if file.type == 'input':
                         continue
                     # skip pseudo datasets
                     if file.destinationDBlock in ['',None,'NULL']:
                         continue
                     # start closer for output/log datasets
                     if file.destinationDBlock not in destDBList:
                         destDBList.append(file.destinationDBlock)
                     # collect GUIDs
                     if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['ptest','rc_test','rucio_test'] and \
                                                               self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                               and file.type == 'output':
                         # extract base LFN since LFN was changed to full LFN for CMS
                         baseLFN = file.lfn.split('/')[-1]
                         guidList.append({'lfn':baseLFN,'guid':file.GUID,'type':file.type,
                                          'checksum':file.checksum,'md5sum':file.md5sum,
                                          'fsize':file.fsize,'scope':file.scope})
                 if guidList:
                     retG = self.taskBuffer.setGUIDs(guidList)
                 if destDBList:
                     # start Closer
                     if adderPlugin is not None and hasattr(adderPlugin,'datasetMap') and adderPlugin.datasetMap != {}:
                         cThr = Closer.Closer(self.taskBuffer,destDBList,self.job,datasetMap=adderPlugin.datasetMap)
                     else:
                         cThr = Closer.Closer(self.taskBuffer,destDBList,self.job)
                     self.logger.debug("start Closer")
                     cThr.start()
                     cThr.join()
                     self.logger.debug("end Closer")
         self.logger.debug("end")
         try:
             # remove Catalog
             os.remove(self.xmlFile)
         except OSError:
             pass
         # unlock XML
         if self.lockXML is not None:
             fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
             self.lockXML.close()            
     except Exception:
         errtype, errvalue, traceBack = sys.exc_info()
         self.logger.debug(": %s %s" % (errtype, errvalue))
         self.logger.debug("except")
         # unlock XML just in case
         try:
             if self.lockXML is not None:
                 fcntl.flock(self.lockXML.fileno(), fcntl.LOCK_UN)
         except Exception:
             errtype, errvalue, traceBack = sys.exc_info()
             self.logger.debug(": %s %s" % (errtype, errvalue))
             self.logger.debug("cannot unlock XML")
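
Note: the final job state in both AdderGen variants above follows the same branching: failed when the DDM interaction did not succeed, merging or transferring while outputs are still in flight, finished otherwise. A pure-function distillation for readability; the AddResult stand-in is illustrative, and the real plugin result object carries more fields:

 import collections

 AddResult = collections.namedtuple('AddResult',
                                    'succeeded mergingFiles transferringFiles')

 def decideJobStatus(addResult):
     # mirrors the branching in run(): failed -> merging -> transferring -> finished
     if addResult is None or not addResult.succeeded:
         return 'failed'
     if addResult.mergingFiles:
         return 'merging'
     if addResult.transferringFiles:
         return 'transferring'
     return 'finished'

 # decideJobStatus(AddResult(True, [], ['out.root'])) -> 'transferring'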