Example 1
 def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
     self.taskBuffer = taskBuffer
     self.siteMapper = siteMapper
     self.ignoreError = ignoreError
     self.evpFileName = evpFileName
     self.token = datetime.datetime.utcnow().isoformat(' ')
     # logger
     self.logger = LogWrapper(_logger, self.token)
     self.pd2p = DynDataDistributer.DynDataDistributer([],
                                                       self.taskBuffer,
                                                       self.siteMapper,
                                                       token=' ',
                                                       logger=self.logger)
     self.userDatasetName = ''
     self.creationTime = ''
     self.params = ''
     self.lockedBy = ''
     self.evpFile = None
     self.userTaskName = ''
     # message buffer
     self.msgBuffer = []
     self.lineLimit = 100
     # JEDI
     self.jediTaskID = None
     self.prodSourceLabel = None
     self.job_label = None
Example 2
 def __init__(self,
              taskBuffer,
              jobID,
              jobStatus,
              attemptNr,
              ignoreTmpError=True,
              siteMapper=None,
              pid=None,
              prelock_pid=None,
              lock_offset=10):
     self.job = None
     self.jobID = jobID
     self.jobStatus = jobStatus
     self.taskBuffer = taskBuffer
     self.ignoreTmpError = ignoreTmpError
     self.lock_offset = lock_offset
     self.siteMapper = siteMapper
     self.datasetMap = {}
     self.extraInfo = {
         'surl': {},
         'nevents': {},
         'lbnr': {},
         'endpoint': {},
         'guid': {}
     }
     self.attemptNr = attemptNr
     self.pid = pid
     self.prelock_pid = prelock_pid
     self.data = None
     # logger
     self.logger = LogWrapper(_logger, str(self.jobID))
Example 3
 def __init__(self,
              taskBuffer,
              jobID,
              jobStatus,
              xmlFile,
              ignoreTmpError=True,
              siteMapper=None):
     self.job = None
     self.jobID = jobID
     self.jobStatus = jobStatus
     self.taskBuffer = taskBuffer
     self.ignoreTmpError = ignoreTmpError
     self.lockXML = None
     self.siteMapper = siteMapper
     self.attemptNr = None
     self.xmlFile = xmlFile
     self.datasetMap = {}
     self.extraInfo = {
         'surl': {},
         'nevents': {},
         'lbnr': {},
         'endpoint': {},
         'guid': {}
     }
     # extract attemptNr
     try:
         tmpAttemptNr = self.xmlFile.split('/')[-1].split('_')[-1]
         if re.search(r'^\d+$', tmpAttemptNr) is not None:
             self.attemptNr = int(tmpAttemptNr)
     except Exception:
         pass
     # logger
     self.logger = LogWrapper(_logger, str(self.jobID))
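A minimal, self-contained sketch of the attempt-number extraction above, using a made-up XML file name (only the trailing "_<number>" part matters; the path and name are purely illustrative):

import re

xmlFile = '/var/cache/panda/1234567_event.picking.xml_3'
tmpAttemptNr = xmlFile.split('/')[-1].split('_')[-1]
# keep the attempt number only if the trailing token is purely numeric
attemptNr = int(tmpAttemptNr) if re.search(r'^\d+$', tmpAttemptNr) else None
print(attemptNr)  # 3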
Example 4
def put_file_recovery_request(req, jediTaskID, dryRun=None):
    if not Protocol.isSecure(req):
        return json.dumps((False, "ERROR : no HTTPS"))
    userName = req.subprocess_env['SSL_CLIENT_S_DN']
    creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    tmpLog = LogWrapper(_logger, 'put_file_recovery_request < jediTaskID={}'.format(jediTaskID))
    tmpLog.debug("start user={}".format(userName))
    # get total size
    try:
        jediTaskID = int(jediTaskID)
        # make filename
        evpFileName = '%s/recov.%s' % (panda_config.cache_dir,str(uuid.uuid4()))
        tmpLog.debug("file={}".format(evpFileName))
        # write
        with open(evpFileName, 'w') as fo:
            data = {"userName": userName,
                    "creationTime": creationTime,
                    "jediTaskID": int(jediTaskID)
                    }
            if dryRun:
                data['dryRun'] = True
            json.dump(data, fo)
    except Exception as e:
        errStr = "cannot put request due to {} ".format(str(e))
        tmpLog.error(errStr + traceback.format_exc())
        return json.dumps((False, errStr))
    tmpLog.debug('done')
    return json.dumps((True, 'request was accepted and will be processed in a few minutes'))
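A hedged sketch of the JSON payload that put_file_recovery_request writes into the cache file above; every value here is fabricated for illustration:

import datetime
import json

# same keys as written by put_file_recovery_request; values are made up
data = {"userName": "/DC=ch/DC=cern/CN=Some User",
        "creationTime": datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'),
        "jediTaskID": 12345678}
data['dryRun'] = True  # only present when a dry run is requested
print(json.dumps(data, indent=2))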
Example 5
def main(tbuf=None, **kwargs):
    # logger
    tmpLog = LogWrapper(_logger)

    tmpLog.debug("================= start ==================")
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config,'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas','atlas:/atlas/Role=production','atlas:/atlas/Role=pilot']
    # get users
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {}
    varMap[':patt'] = '%p%'
    tmpStat,tmpRes = taskBuffer.querySQLS(sql,varMap)
    for realDN, in tmpRes:
        if realDN is None:
            continue
        realDN = CoreUtils.get_bare_dn(realDN, keep_digits=False)
        name = taskBuffer.cleanUserID(realDN)
        # check proxy
        tmpLog.debug("check proxy cache for {}".format(name))
        for role in roles:
            my_proxy_interface_instance.checkProxy(realDN, role=role, name=name)
    tmpLog.debug("done")
Example 6
def delete_checkpoint(req, task_id, sub_id):
    tmpLog = LogWrapper(_logger, 'delete_checkpoint <jediTaskID={0} ID={1}>'.format(task_id, sub_id))
    status = True
    if not Protocol.isSecure(req):
        msg = 'insecure request'
        tmpLog.error(msg)
        status = False
    else:
        tmpLog.debug("start %s" % req.subprocess_env['SSL_CLIENT_S_DN'])
        try:
            fileFullPath = os.path.join(panda_config.cache_dir, get_checkpoint_filename(task_id, sub_id))
            os.remove(fileFullPath)
            msg = 'done'
            tmpLog.debug(msg)
        except Exception as e:
            msg = "failed to delete file due to {0}".format(str(e))
            tmpLog.error(msg)
            status = False
    return json.dumps({'status': status, 'message': msg})
Example 7
class CloserAtlasPlugin:

    # constructor
    def __init__(self,job,datasets,log):
        self.jobSpec = job
        self.datasets = datasets
        self.tmpLog = LogWrapper(log,"{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))


        
    # execute
    def execute(self):
        try:
            # only for production
            if self.jobSpec.prodSourceLabel not in ['managed','test']:
                return True
            # only for urgent or high prio
            if self.jobSpec.processingType not in ['urgent'] and self.jobSpec.currentPriority <= 1000:
                return True
            # close datasets
            for datasetSpec in self.datasets:
                if re.search(r'_sub\d+$', datasetSpec.name) is None:
                    continue
                if datasetSpec.status != 'tobeclosed':
                    continue
                try:
                    self.tmpLog.debug('immediate close {0}'.format(datasetSpec.name))
                    rucioAPI.closeDataset(datasetSpec.name)
                except Exception:
                    errtype,errvalue = sys.exc_info()[:2]
                    self.tmpLog.warning('failed to close : {0} {1}'.format(errtype,errvalue))
        except Exception:
            errtype,errvalue = sys.exc_info()[:2]
            self.tmpLog.warning('failed to execute : {0} {1}'.format(errtype,errvalue))
        return True
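A small, self-contained sketch of the "_sub<number>" filter applied in execute() above, with made-up dataset names; only names ending in "_sub" followed by digits would be closed immediately:

import re

names = ['mc23_13p6TeV.12345.simul.HITS.e8514_sub0123456789',
         'mc23_13p6TeV.12345.simul.HITS.e8514_tid00012345_00']
for name in names:
    # True only for the first name, which carries the _sub suffix
    print(name, bool(re.search(r'_sub\d+$', name)))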
Example 8
 def __call__(self, *args, **kwargs):
     log = LogWrapper(
         _logger,
         'pid={} thr={} {}'.format(os.getpid(),
                                   threading.current_thread().ident,
                                   self.methodName))
     log.debug('start')
     # get lock among children
     i = self.childlock.get()
     # make dict to send to the master
     self.commDict[i].update({
         'methodName': self.methodName,
         'args': pickle.dumps(args),
         'kwargs': pickle.dumps(kwargs)
     })
     # send notification to master
     self.comLock[i].release()
     # wait for the response
     self.resLock[i].acquire()
     res = pickle.loads(self.commDict[i]['res'])
     statusCode = self.commDict[i]['stat']
     # release lock to children
     self.childlock.put(i)
     log.debug('end')
     # return
     if statusCode == 0:
         return res
     else:
         errtype, errvalue = res
         raise RuntimeError("{0}: {1} {2}".format(self.methodName,
                                                  errtype.__name__,
                                                  errvalue))
Example 9
def uploadLog(req, file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    tmpLog = LogWrapper(_logger, 'uploadLog <{0}>'.format(file.filename))
    tmpLog.debug("start {0}".format(req.subprocess_env['SSL_CLIENT_S_DN']))
    # size check
    sizeLimit = 100 * 1024 * 1024
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except Exception:
        if "content-length" in req.headers_in:
            tmpLog.error("cannot get CL : %s" %
                         req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "failed to upload log due to size limit"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    jediLogDir = '/jedilog'
    retStr = ''
    try:
        fileBaseName = file.filename.split('/')[-1]
        fileFullPath = '{0}{1}/{2}'.format(panda_config.cache_dir, jediLogDir,
                                           fileBaseName)
        # delete old file
        if os.path.exists(fileFullPath):
            os.remove(fileFullPath)
        # write
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
        tmpLog.debug("written to {0}".format(fileFullPath))
        retStr = 'http://{0}/cache{1}/{2}'.format(getServerHTTP(None),
                                                  jediLogDir, fileBaseName)
    except Exception:
        errtype, errvalue = sys.exc_info()[:2]
        errStr = "failed to write log with {0}:{1}".format(
            errtype.__name__, errvalue)
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    tmpLog.debug("end")
    return retStr
Example 10
def putFile(req, file):
    tmpLog = LogWrapper(_logger, 'putFile-{}'.format(datetime.datetime.utcnow().isoformat('/')))
    if not Protocol.isSecure(req):
        tmpLog.error('No SSL_CLIENT_S_DN')
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    # user name
    username = CoreUtils.clean_user_id(req.subprocess_env['SSL_CLIENT_S_DN'])
    tmpLog.debug("start %s %s" % (username, file.filename))
    # size check
    fullSizeLimit = 768*1024*1024
    if not file.filename.startswith('sources.'):
        noBuild = True
        sizeLimit = 100*1024*1024
    else:
        noBuild = False
        sizeLimit = fullSizeLimit
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except Exception:
        if "content-length" in req.headers_in:
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength,sizeLimit)
        if noBuild:
            errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit"
        else:
            errStr += " Please remove redundant files from your workarea"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    try:
        fileName = file.filename.split('/')[-1]
        fileFullPath = '%s/%s' % (panda_config.cache_dir, fileName)

        # avoid overwriting
        if os.path.exists(fileFullPath):
            # touch
            os.utime(fileFullPath,None)
            # send error message
            errStr = "ERROR : Cannot overwrite file"
            tmpLog.debug('cannot overwrite file %s' % fileName)
            tmpLog.debug("end")
            return errStr
        # write
        fo = open(fileFullPath,'wb')
        fileContent = file.file.read()
        if hasattr(panda_config, 'compress_file_names') and \
                [True for patt in panda_config.compress_file_names.split(',') if re.search(patt, fileName) is not None]:
            fileContent = gzip.compress(fileContent)
        fo.write(fileContent)
        fo.close()
    except Exception:
        errStr = "ERROR : Cannot write file"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    # checksum
    try:
        # decode Footer
        footer = fileContent[-8:]
        checkSum,isize = struct.unpack("II",footer)
        tmpLog.debug("CRC from gzip Footer %s" % checkSum)
    except Exception:
        # calculate on the fly
        """
        import zlib
        checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF
        """
        # use None to avoid delay for now
        checkSum = None
        tmpLog.debug("CRC calculated %s" % checkSum)
    # file size
    fileSize = len(fileContent)
    tmpLog.debug("written dn=%s file=%s size=%s crc=%s" % \
                  (username, fileFullPath, fileSize, checkSum))
    # put file info to DB
    if panda_config.record_sandbox_info:
        to_insert = True
        for patt in IGNORED_SUFFIX:
            if file.filename.endswith(patt):
                to_insert = False
                break
        if not to_insert:
            tmpLog.debug("skipped to insert to DB")
        else:
            statClient,outClient = Client.insertSandboxFileInfo(username,file.filename,
                                                                fileSize,checkSum)
            if statClient != 0 or outClient.startswith("ERROR"):
                tmpLog.error("failed to put sandbox to DB with %s %s" % (statClient,outClient))
                #_logger.debug("putFile : end")
                #return "ERROR : Cannot insert sandbox to DB"
            else:
                tmpLog.debug("inserted sandbox to DB with %s" % outClient)
    tmpLog.debug("end")
    return True
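The checksum block above relies on the gzip trailer: the last 8 bytes of a gzip stream hold CRC-32 and ISIZE (uncompressed length modulo 2**32) as two little-endian uint32 values, which is why struct.unpack can read them directly ("II" with native byte order matches on little-endian hosts). A minimal sketch of that trick, independent of the PanDA cache and with a made-up payload:

import gzip
import struct
import zlib

payload = b'user sandbox contents ' * 100
blob = gzip.compress(payload)
# CRC-32 and ISIZE live in the fixed 8-byte gzip footer
checkSum, isize = struct.unpack('<II', blob[-8:])
assert checkSum == (zlib.crc32(payload) & 0xFFFFFFFF)
assert isize == len(payload) % (2 ** 32)
print('CRC from gzip footer %s size %s' % (checkSum, isize))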
Example 11
import datetime
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper
from pandaserver.brokerage.SiteMapper import SiteMapper
from pandaserver.taskbuffer import ErrorCode

# password
from pandaserver.config import panda_config

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "
Example 12
def main(argv=tuple(), tbuf=None, **kwargs):

    try:
        long
    except NameError:
        long = int

    tmpLog = LogWrapper(_logger, None)

    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3,
                                        minute=2,
                                        second=0,
                                        microsecond=0)
        if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
               (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(
                hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                  loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow(
            ).strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += r'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)

                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        if (currentMinute // panda_config.nrun_interval
            ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' %
                                 str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob],
                                    51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:6]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    # terminate TaskBuffer IF
    # taskBufferIF.terminate()

    tmpLog.debug("===================== end =====================")
Example 13
import re
from pandaserver.config import panda_config

from pandaserver.taskbuffer.TaskBuffer import taskBuffer

from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper

from pandaserver.proxycache import panda_proxy_cache

# logger
_logger = PandaLogger().getLogger('panda_activeusers_query')
tmpLog = LogWrapper(_logger)

if __name__ == '__main__':

    tmpLog.debug("================= start ==================")
    # instantiate TB
    taskBuffer.init(panda_config.dbhost,
                    panda_config.dbpasswd,
                    nDBConnection=1)

    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()

    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = [
            'atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot'
Example 14
 def run(self):
     # get logger
     tmpLog = LogWrapper(_logger,'<vuid={0} site={1} name={2}>'.format(self.vuid,
                                                                       self.site,
                                                                       self.dataset))
     # query dataset
     tmpLog.debug("start")
     if self.vuid is not None:
         dataset = self.taskBuffer.queryDatasetWithMap({'vuid':self.vuid})
     else:
         dataset = self.taskBuffer.queryDatasetWithMap({'name':self.dataset})
     if dataset is None:
         tmpLog.error("Not found")
         tmpLog.debug("end")
         return
     tmpLog.debug("type:%s name:%s" % (dataset.type,dataset.name))
     if dataset.type == 'dispatch':
         # activate jobs in jobsDefined
         Activator(self.taskBuffer,dataset).start()
     if dataset.type == 'output':
         if dataset.name is not None and re.search(r'^panda\..*_zip$', dataset.name) is not None:
             # start unmerge jobs
             Activator(self.taskBuffer,dataset,enforce=True).start()
         else:
             # finish transferring jobs
             Finisher(self.taskBuffer,dataset,site=self.site).start()
     tmpLog.debug("end")
Example 15
 def getGUIDsFromEventIndex(self, runEventList, streamName, amiTags,
                            dataType):
     comment = ' /* DBProxy.getGUIDsFromEventIndex */'
     methodName = comment.split(' ')[-2].split('.')[-1]
     tmpLog = LogWrapper(
         _logger,
         methodName + " <streamName={0} amiTags={1} dataType={2}>".format(
             streamName, amiTags, dataType))
     try:
         # change to list
         if amiTags not in [None, '']:
             amiTags = amiTags.replace('*', '.*').split(',')
         tmpLog.debug("start for {0} events".format(len(runEventList)))
         # check data type
         if dataType not in ['RAW', 'ESD', 'AOD']:
             return False, 'dataType={0} is unsupported'.format(dataType)
         # sql to insert runs and events
         sqlRE = "INSERT INTO {0}.TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) ".format(
             panda_config.schemaEI)
         sqlRE += "VALUES (:runNumber,:eventNumber) "
         varMaps = []
         for runNumber, eventNumber in runEventList:
             varMap = {}
             varMap[':runNumber'] = runNumber
             varMap[':eventNumber'] = eventNumber
             varMaps.append(varMap)
         # begin transaction
         self.conn.begin()
         self.cur.arraysize = 100000
         # insert runs and events
         self.cur.executemany(sqlRE + comment, varMaps)
         # read GUIDs
         varMap = {}
         if amiTags in [None, '']:
             sqlRG = "SELECT runNumber,eventNumber,guid_{0} ".format(
                 dataType)
             sqlRG += "FROM {0}.V_PANDA_EVPICK_NOAMITAG_MANY ".format(
                 panda_config.schemaEI)
         else:
             sqlRG = "SELECT runNumber,eventNumber,guid_{0},amiTag ".format(
                 dataType)
             sqlRG += "FROM {0}.V_PANDA_EVPICK_AMITAG_MANY ".format(
                 panda_config.schemaEI)
         if streamName not in [None, '']:
             sqlRG += "WHERE streamName=:streamName "
             varMap[':streamName'] = streamName
         self.cur.execute(sqlRG + comment, varMap)
         resRG = self.cur.fetchall()
         # commit
         if not self._commit():
             raise RuntimeError('Commit error')
         retValue = {}
         keyAmiIdxMap = {}
         for tmpItem in resRG:
             if amiTags in [None, '']:
                 runNumber, eventNumber, guid = tmpItem
                 # dummy
                 idxTag = 0
             else:
                 runNumber, eventNumber, guid, amiTag = tmpItem
                 # get index number for the AMI tag in the list
                 idxTag = self.getIndexAmiTag(amiTags, amiTag)
                 # didn't match
                 if idxTag is None:
                     continue
             tmpKey = (runNumber, eventNumber)
             # use AMI tag in preference order
             if tmpKey in keyAmiIdxMap and keyAmiIdxMap[tmpKey] < idxTag:
                 continue
             keyAmiIdxMap[tmpKey] = idxTag
             retValue[tmpKey] = [guid]
         tmpLog.debug("found {0} events".format(len(retValue)))
         return True, retValue
     except Exception:
         # roll back
         self._rollback()
         # error
         self.dumpErrorMessage(_logger, methodName)
         return False, None
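The insert step above batches one bind-variable dictionary per run/event pair and sends them all with executemany. A self-contained sketch of the same pattern against an in-memory SQLite table mirroring TMP_RUN_EVENT_PAIRS (note that sqlite3, unlike the PanDA DB wrapper, expects the dictionary keys without the leading colon):

import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute('CREATE TABLE TMP_RUN_EVENT_PAIRS (runNumber INTEGER, eventNumber INTEGER)')
# fabricated run/event pairs
runEventList = [(358031, 1001), (358031, 1002), (358096, 42)]
varMaps = [{'runNumber': run, 'eventNumber': evt} for run, evt in runEventList]
cur.executemany('INSERT INTO TMP_RUN_EVENT_PAIRS (runNumber,eventNumber) '
                'VALUES (:runNumber,:eventNumber)', varMaps)
conn.commit()
print(cur.execute('SELECT COUNT(*) FROM TMP_RUN_EVENT_PAIRS').fetchone()[0])  # 3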
Example 16
def main(argv=tuple(), tbuf=None, **kwargs):

    try:
        long
    except NameError:
        long = int

    prelock_pid = GenericThread().get_pid()
    tmpLog = LogWrapper(_logger, "<pid={}>".format(prelock_pid))

    tmpLog.debug("===================== start =====================")

    # return value, true to run main again in next daemon loop
    ret_val = True

    # grace period
    try:
        gracePeriod = int(argv[1])
    except Exception:
        gracePeriod = 1

    # lock interval in minutes
    lock_interval = 10

    # retry interval in minutes
    retry_interval = 3

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # thread for adder
    class AdderThread(GenericThread):
        def __init__(self, taskBuffer, aSiteMapper, job_output_reports):
            GenericThread.__init__(self)
            self.taskBuffer = taskBuffer
            self.aSiteMapper = aSiteMapper
            self.job_output_reports = job_output_reports

        # main loop
        def run(self):
            # initialize
            taskBuffer = self.taskBuffer
            aSiteMapper = self.aSiteMapper
            # get file list
            timeNow = datetime.datetime.utcnow()
            timeInt = datetime.datetime.utcnow()
            # unique pid
            GenericThread.__init__(self)
            uniq_pid = self.get_pid()
            # log pid
            tmpLog.debug("pid={0} : run".format(uniq_pid))
            # stats
            n_processed = 0
            # loop
            while True:
                # get report
                one_jor = self.job_output_reports.pop()
                if not one_jor:
                    break
                # lock
                panda_id, job_status, attempt_nr, time_stamp = one_jor
                got_lock = taskBuffer.lockJobOutputReport(
                    panda_id=panda_id,
                    attempt_nr=attempt_nr,
                    pid=uniq_pid,
                    time_limit=lock_interval)
                if not got_lock:
                    continue
                # add
                try:
                    modTime = time_stamp
                    if (timeNow - modTime) > datetime.timedelta(hours=24):
                        # last add
                        tmpLog.debug(
                            "pid={0} : last add job={1}.{2} st={3}".format(
                                uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = False
                    else:
                        # usual add
                        tmpLog.debug("pid={0} : add job={1}.{2} st={3}".format(
                            uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = True
                    # get adder
                    adder_gen = AdderGen(taskBuffer,
                                         panda_id,
                                         job_status,
                                         attempt_nr,
                                         ignoreTmpError=ignoreTmpError,
                                         siteMapper=aSiteMapper,
                                         pid=uniq_pid,
                                         prelock_pid=uniq_pid,
                                         lock_offset=lock_interval -
                                         retry_interval)
                    n_processed += 1
                    # execute
                    adder_gen.run()
                    del adder_gen
                except Exception as e:
                    tmpLog.error("pid={} : failed to run with {} {}".format(
                        uniq_pid, str(e), traceback.format_exc()))
            # stats
            tmpLog.debug("pid={} : processed {}".format(uniq_pid, n_processed))

        # launcher, run with multiprocessing
        def proc_launch(self):
            # run
            self.process = multiprocessing.Process(target=self.run)
            self.process.start()

        # join of multiprocessing
        def proc_join(self):
            self.process.join()

    # TaskBuffer with more connections behind TaskBufferInterface
    tmpLog.debug("setup taskBufferIF")
    n_connections = 4
    _tbuf = TaskBuffer()
    _tbuf.init(panda_config.dbhost,
               panda_config.dbpasswd,
               nDBConnection=n_connections)
    taskBufferIF = TaskBufferInterface()
    taskBufferIF.launch(_tbuf)

    # add files
    tmpLog.debug("run Adder")

    interval = 10
    nLoop = 10
    for iLoop in range(nLoop):
        tmpLog.debug('start iLoop={}/{}'.format(iLoop, nLoop))
        start_time = datetime.datetime.utcnow()
        adderThrList = []
        nThr = 10

        n_jors_per_batch = 1000

        jor_lists = WeightedLists(multiprocessing.Lock())

        # get some job output reports
        jor_list_others = taskBuffer.listJobOutputReport(
            only_unlocked=True,
            time_limit=lock_interval,
            limit=n_jors_per_batch * nThr,
            grace_period=gracePeriod,
            anti_labels=['user'])
        jor_lists.add(3, jor_list_others)
        jor_list_user = taskBuffer.listJobOutputReport(
            only_unlocked=True,
            time_limit=lock_interval,
            limit=n_jors_per_batch * nThr,
            grace_period=gracePeriod,
            labels=['user'])
        jor_lists.add(7, jor_list_user)

        # adder consumer processes
        _n_thr_with_tbuf = 0
        tbuf_list = []
        tmpLog.debug("got {} job reports".format(len(jor_lists)))
        for i in range(nThr):
            if i < _n_thr_with_tbuf:
                tbuf = TaskBuffer()
                tbuf_list.append(tbuf)
                tbuf.init(panda_config.dbhost,
                          panda_config.dbpasswd,
                          nDBConnection=1)
                thr = AdderThread(tbuf, aSiteMapper, jor_lists)
            else:
                thr = AdderThread(taskBufferIF.getInterface(), aSiteMapper,
                                  jor_lists)
            adderThrList.append(thr)
        # start all threads
        for thr in adderThrList:
            # thr.start()
            thr.proc_launch()
            time.sleep(0.25)

        # join all threads
        for thr in adderThrList:
            # thr.join()
            thr.proc_join()
        [tbuf.cleanup() for tbuf in tbuf_list]
        end_time = datetime.datetime.utcnow()
        sleep_time = interval - (end_time - start_time).seconds
        if sleep_time > 0 and iLoop + 1 < nLoop:
            sleep_time = random.randint(1, sleep_time)
            tmpLog.debug("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)

    # stop TaskBuffer IF
    taskBufferIF.stop()

    tmpLog.debug("===================== end =====================")

    # return
    return ret_val
Example 17
def core_exec(sandbox_url, log_token, dump_workflow, ops_file, user_name, test_mode):
    tmpLog = LogWrapper(_logger, log_token)
    is_OK = True
    is_fatal = False
    request_id = None
    if dump_workflow == 'True':
        dump_workflow = True
    else:
        dump_workflow = False
    if test_mode == 'True':
        test_mode = True
    else:
        test_mode = False
    try:
        with open(ops_file) as f:
            ops = json.load(f)
        try:
            os.remove(ops_file)
        except Exception:
            pass
        # go to temp dir
        cur_dir = os.getcwd()
        with tempfile.TemporaryDirectory() as tmp_dirname:
            os.chdir(tmp_dirname)
            # download sandbox
            tmpLog.info('downloading sandbox from {}'.format(sandbox_url))
            with requests.get(sandbox_url, allow_redirects=True, verify=False, stream=True) as r:
                if r.status_code == 400:
                    tmpLog.error("not found")
                    is_fatal = True
                    is_OK = False
                elif r.status_code != 200:
                    tmpLog.error("bad HTTP response {}".format(r.status_code))
                    is_OK = False
                # extract sandbox
                if is_OK:
                    with open(ops['data']['sandbox'], 'wb') as fs:
                        for chunk in r.raw.stream(1024, decode_content=False):
                            if chunk:
                                fs.write(chunk)
                        fs.close()
                        tmp_stat, tmp_out = commands_get_status_output(
                            'tar xvfz {}'.format(ops['data']['sandbox']))
                        if tmp_stat != 0:
                            tmpLog.error(tmp_out)
                            dump_str = 'failed to extract {}'.format(ops['data']['sandbox'])
                            tmpLog.error(dump_str)
                            is_fatal = True
                            is_OK = False
                # parse workflow files
                if is_OK:
                    tmpLog.info('parse workflow')
                    if ops['data']['language'] == 'cwl':
                        nodes, root_in = pcwl_utils.parse_workflow_file(ops['data']['workflowSpecFile'],
                                                                        tmpLog)
                        with open(ops['data']['workflowInputFile']) as workflow_input:
                            data = yaml.safe_load(workflow_input)
                        s_id, t_nodes, nodes = pcwl_utils.resolve_nodes(nodes, root_in, data, 0, set(),
                                                                        ops['data']['outDS'], tmpLog)
                        workflow_utils.set_workflow_outputs(nodes)
                        id_node_map = workflow_utils.get_node_id_map(nodes)
                        [node.resolve_params(ops['data']['taskParams'], id_node_map) for node in nodes]
                        dump_str = "the description was internally converted as follows\n" \
                                   + workflow_utils.dump_nodes(nodes)
                        tmpLog.info(dump_str)
                        for node in nodes:
                            s_check, o_check = node.verify()
                            tmp_str = 'Verification failure in ID:{} {}'.format(node.id, o_check)
                            if not s_check:
                                tmpLog.error(tmp_str)
                                dump_str += tmp_str
                                dump_str += '\n'
                                is_fatal = True
                                is_OK = False
                    else:
                        dump_str = "{} is not supported to describe the workflow"
                        tmpLog.error(dump_str)
                        is_fatal = True
                        is_OK = False
                    # convert to workflow
                    if is_OK:
                        workflow_to_submit, dump_str_list = workflow_utils.convert_nodes_to_workflow(nodes)
                        try:
                            if workflow_to_submit:
                                if not test_mode:
                                    tmpLog.info('submit workflow')
                                    wm = ClientManager(host=get_rest_host())
                                    request_id = wm.submit(workflow_to_submit, username=user_name)
                            else:
                                dump_str = 'workflow is empty'
                                tmpLog.error(dump_str)
                                is_fatal = True
                                is_OK = False
                        except Exception as e:
                            dump_str = 'failed to submit the workflow with {}'.format(str(e))
                            tmpLog.error('{} {}'.format(dump_str, traceback.format_exc()))
                        if dump_workflow:
                            tmpLog.debug('\n' + ''.join(dump_str_list))
        os.chdir(cur_dir)
    except Exception as e:
        is_OK = False
        is_fatal = True
        tmpLog.error("failed to run with {} {}".format(str(e), traceback.format_exc()))

    with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_json:
        json.dump([is_OK, is_fatal, request_id, tmpLog.dumpToString()], tmp_json)
        print(tmp_json.name)
    sys.exit(0)
Example 18
def put_workflow_request(req, data, check=False):
    if not Protocol.isSecure(req):
        return json.dumps((False, "ERROR : no HTTPS"))
    userName = req.subprocess_env['SSL_CLIENT_S_DN']
    creationTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    tmpLog = LogWrapper(_logger, 'put_workflow_request')
    tmpLog.debug("start user={} check={}".format(userName, check))
    if check == 'True' or check is True:
        check = True
    else:
        check = False
    # get total size
    try:
        # make filename
        evpFileName = '%s/workflow.%s' % (panda_config.cache_dir,str(uuid.uuid4()))
        tmpLog.debug("file={}".format(evpFileName))
        # write
        with open(evpFileName, 'w') as fo:
            data = {"userName": userName,
                    "creationTime": creationTime,
                    "data": json.loads(data),
                    }
            json.dump(data, fo)
        # check
        if check:
            tmpLog.debug('checking')
            from pandaserver.taskbuffer.workflow_processor import WorkflowProcessor
            processor = WorkflowProcessor(log_stream=_logger)
            ret = processor.process(evpFileName, True, True, True, True)
            if os.path.exists(evpFileName):
                try:
                    os.remove(evpFileName)
                except Exception:
                    pass
            tmpLog.debug('done')
            return json.dumps((True, ret))
    except Exception as e:
        errStr = "cannot put request due to {} ".format(str(e))
        tmpLog.error(errStr + traceback.format_exc())
        return json.dumps((False, errStr))
    tmpLog.debug('done')
    return json.dumps((True, 'request was accepted and will be processed in a few minutes'))
Example 19
 def process(self, file_name, to_delete=False, test_mode=False, get_log=False, dump_workflow=False):
     try:
         is_fatal = False
         is_OK = True
         request_id = None
         dump_str = None
         with open(file_name) as f:
             ops = json.load(f)
             user_name = clean_user_id(ops["userName"])
             base_platform = ops['data'].get('base_platform')
             for task_type in ops['data']['taskParams']:
                 ops['data']['taskParams'][task_type]['userName'] = user_name
                 if base_platform:
                     ops['data']['taskParams'][task_type]['basePlatform'] = base_platform
             log_token = '< id="{}" test={} outDS={} >'.format(user_name, test_mode, ops['data']['outDS'])
             tmpLog = LogWrapper(self.log, log_token)
             tmpLog.info('start {}'.format(file_name))
             sandbox_url = os.path.join(ops['data']['sourceURL'], 'cache', ops['data']['sandbox'])
             # IO through json files
             ops_file = tempfile.NamedTemporaryFile(delete=False, mode='w')
             json.dump(ops, ops_file)
             ops_file.close()
             # execute main in another process to avoid chdir mess
             tmp_stat, tmp_out = commands_get_status_output("python {} {} '{}' {} {} '{}' {}".format(
                 __file__, sandbox_url, log_token, dump_workflow, ops_file.name,
                 user_name, test_mode))
             if tmp_stat:
                 is_OK = False
                 tmpLog.error('main execution failed with {}:{}'.format(tmp_stat, tmp_out))
             else:
                 with open(tmp_out.split('\n')[-1]) as tmp_out_file:
                     is_OK, is_fatal, request_id, dump_str = json.load(tmp_out_file)
                 try:
                     os.remove(tmp_out)
                 except Exception:
                     pass
             if not get_log:
                 if is_OK:
                     tmpLog.info('is_OK={} request_id={}'.format(is_OK, request_id))
                 else:
                     tmpLog.info('is_OK={} is_fatal={} request_id={}'.format(is_OK, is_fatal, request_id))
             if to_delete or (not test_mode and (is_OK or is_fatal)):
                 dump_str = tmpLog.dumpToString() + dump_str
                 tmpLog.debug('delete {}'.format(file_name))
                 try:
                     os.remove(file_name)
                 except Exception:
                     pass
                 # send notification
                 if not test_mode and self.taskBuffer is not None:
                     toAdder = self.taskBuffer.getEmailAddr(user_name)
                     if toAdder is None or toAdder.startswith('notsend'):
                         tmpLog.debug('skip to send notification since suppressed')
                     else:
                         # message
                         if is_OK:
                             mailSubject = "PANDA Notification for Workflow {}".format(ops['data']['outDS'])
                             mailBody = "Hello,\n\nWorkflow:{} has been accepted with RequestID:{}\n\n".\
                                 format(ops['data']['outDS'], request_id)
                         else:
                             mailSubject = "PANDA WARNING for Workflow={}".format(ops['data']['outDS'])
                             mailBody = "Hello,\n\nWorkflow {} was not accepted\n\n".\
                                 format(ops['data']['outDS'], request_id)
                             mailBody += "Reason : %s\n" % dump_str
                         # send
                         tmpSM = MailUtils().send(toAdder, mailSubject, mailBody)
                         tmpLog.debug('sent message with {}'.format(tmpSM))
     except Exception as e:
         is_OK = False
         tmpLog.error("failed to run with {} {}".format(str(e), traceback.format_exc()))
     if get_log:
         ret_val = {'status': is_OK}
         if is_OK:
             ret_val['log'] = dump_str
         else:
             if dump_str is None:
                 ret_val['log'] = tmpLog.dumpToString()
             else:
                 ret_val['log'] = dump_str
         return ret_val
Example 20
def put_checkpoint(req, file):
    tmpLog = LogWrapper(_logger, 'put_checkpoint <jediTaskID_subID={0}>'.format(file.filename))
    status = False
    if not Protocol.isSecure(req):
        errStr = 'insecure request'
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    tmpLog.debug("start %s" % req.subprocess_env['SSL_CLIENT_S_DN'])
    # extract taskID and subID
    try:
        task_id, sub_id = file.filename.split('/')[-1].split('_')
    except Exception:
        errStr = 'failed to extract ID'
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    # size check
    sizeLimit = 500 * 1024 * 1024
    # get file size
    try:
        contentLength = long(req.headers_in["content-length"])
    except Exception as e:
        errStr = "cannot get int(content-length) due to {0}".format(str(e))
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "exceeded size limit %s>%s" % (contentLength, sizeLimit)
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    try:
        fileFullPath = os.path.join(panda_config.cache_dir, get_checkpoint_filename(task_id, sub_id))
        # write
        with open(fileFullPath,'wb') as fo:
            fo.write(file.file.read())
    except Exception as e:
        errStr = "cannot write file due to {0}".format(str(e))
        tmpLog.error(errStr)
        return json.dumps({'status': status, 'message': errStr})
    status = True
    tmpMsg = "successfully placed at {0}".format(fileFullPath)
    tmpLog.debug(tmpMsg)
    return json.dumps({'status': status, 'message': tmpMsg})
Example 21
 def application(environ, start_response):
     # get method name
     methodName = ''
     if 'SCRIPT_NAME' in environ:
         methodName = environ['SCRIPT_NAME'].split('/')[-1]
     tmpLog = LogWrapper(_logger, "PID={0} {1}".format(os.getpid(), methodName))
     tmpLog.debug("start")
     regStart = datetime.datetime.utcnow()
     retType = None
     # check method name
     if methodName not in allowedMethods:
         tmpLog.error("is forbidden")
         exeRes = "False : %s is forbidden" % methodName
     else:
         # get method object
         tmpMethod = None
         try:
             tmpMethod = globals()[methodName]
         except Exception:
             pass
         # object not found
         if tmpMethod is None:
             tmpLog.error("is undefined")
             exeRes = "False"
         else:
             try:
                 # get params 
                 tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ,
                                            keep_blank_values=1)
                 # convert to map
                 params = {}
                 for tmpKey in list(tmpPars):
                     if tmpPars[tmpKey].file is not None and tmpPars[tmpKey].filename is not None:
                         # file
                         params[tmpKey] = tmpPars[tmpKey]
                     else:
                         # string
                         params[tmpKey] = tmpPars.getfirst(tmpKey)
                 if panda_config.entryVerbose:
                     tmpLog.debug("with %s" % str(list(params)))
                 # dummy request object
                 dummyReq = DummyReq(environ, tmpLog)
                 param_list = [dummyReq]
                 # exec
                 exeRes = tmpMethod(*param_list, **params)
                 # extract return type
                 if isinstance(exeRes, dict):
                     retType = exeRes['type']
                     exeRes  = exeRes['content']
                 # convert bool to string
                 if exeRes in [True,False]:
                     exeRes = str(exeRes)
             except Exception as e:
                 tmpLog.error("execution failure : {0}".format(str(e)))
                 errStr = ""
                 for tmpKey in environ:
                     tmpVal = environ[tmpKey]
                     errStr += "%s : %s\n" % (tmpKey,str(tmpVal))
                 tmpLog.error(errStr)
                 # return internal server error
                 start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')]) 
                 return [str(e)]
     if panda_config.entryVerbose:
         tmpLog.debug("done")
     regTime = datetime.datetime.utcnow() - regStart
     tmpLog.info("exec_time=%s.%03d sec, return len=%s B" % (regTime.seconds,
                                                             regTime.microseconds/1000,
                                                             len(str(exeRes))))
     # return
     if exeRes == pandaserver.taskbuffer.ErrorCode.EC_NotFound:
         start_response('404 Not Found', [('Content-Type', 'text/plain')])
         return ['not found']
     elif isinstance(exeRes, pandaserver.taskbuffer.ErrorCode.EC_Redirect):
         start_response('302 Redirect', [('Location', exeRes.url)])
         return ['redirect']
     else:                
         if retType == 'json':
             start_response('200 OK', [('Content-Type', 'application/json')])
         else:
             start_response('200 OK', [('Content-Type', 'text/plain')])
         if isinstance(exeRes, str):
             exeRes = exeRes.encode()
         return [exeRes]
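
The dispatcher above resolves the last component of SCRIPT_NAME to a module-level function, converts the CGI form fields into keyword arguments, and passes a DummyReq wrapper around the WSGI environ as the first argument. Below is a hedged sketch for exercising it locally, assuming the module that defines application() is importable (panda_entry_point is a placeholder name); in production the callable is normally served by a WSGI container such as Apache/mod_wsgi.

from wsgiref.simple_server import make_server
# from panda_entry_point import application  # placeholder import; point it at the real module

def local_app(environ, start_response):
    # the dispatcher reads the method name from SCRIPT_NAME; a plain wsgiref
    # server leaves it empty, so copy the request path over for local testing
    environ['SCRIPT_NAME'] = environ.get('PATH_INFO', '')
    return application(environ, start_response)

if __name__ == "__main__":
    httpd = make_server("localhost", 25080, local_app)
    print("listening on http://localhost:25080/<methodName>")
    httpd.serve_forever()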
Example 22
class EventPicker:
    # constructor
    def __init__(self, taskBuffer, siteMapper, evpFileName, ignoreError):
        self.taskBuffer = taskBuffer
        self.siteMapper = siteMapper
        self.ignoreError = ignoreError
        self.evpFileName = evpFileName
        self.token = datetime.datetime.utcnow().isoformat(' ')
        # logger
        self.logger = LogWrapper(_logger, self.token)
        self.pd2p = DynDataDistributer.DynDataDistributer([],
                                                          self.taskBuffer,
                                                          self.siteMapper,
                                                          token=' ',
                                                          logger=self.logger)
        self.userDatasetName = ''
        self.creationTime = ''
        self.params = ''
        self.lockedBy = ''
        self.evpFile = None
        self.userTaskName = ''
        # message buffer
        self.msgBuffer = []
        self.lineLimit = 100
        # JEDI
        self.jediTaskID = None
        self.prodSourceLabel = None
        self.job_label = None

    # main
    def run(self):
        try:
            self.putLog('start %s' % self.evpFileName)
            # lock evp file
            self.evpFile = open(self.evpFileName)
            try:
                fcntl.flock(self.evpFile.fileno(),
                            fcntl.LOCK_EX | fcntl.LOCK_NB)
            except Exception:
                # release
                self.putLog("cannot lock %s" % self.evpFileName)
                self.evpFile.close()
                return True
            # options
            runEvtList = []
            eventPickDataType = ''
            eventPickStreamName = ''
            eventPickDS = []
            eventPickAmiTag = ''
            eventPickNumSites = 1
            inputFileList = []
            tagDsList = []
            tagQuery = ''
            tagStreamRef = ''
            skipDaTRI = False
            runEvtGuidMap = {}
            ei_api = ''
            # read evp file
            for tmpLine in self.evpFile:
                tmpMatch = re.search('^([^=]+)=(.+)$', tmpLine)
                # check format
                if tmpMatch is None:
                    continue
                tmpItems = tmpMatch.groups()
                if tmpItems[0] == 'runEvent':
                    # get run and event number
                    tmpRunEvt = tmpItems[1].split(',')
                    if len(tmpRunEvt) == 2:
                        runEvtList.append(tmpRunEvt)
                elif tmpItems[0] == 'eventPickDataType':
                    # data type
                    eventPickDataType = tmpItems[1]
                elif tmpItems[0] == 'eventPickStreamName':
                    # stream name
                    eventPickStreamName = tmpItems[1]
                elif tmpItems[0] == 'eventPickDS':
                    # dataset pattern
                    eventPickDS = tmpItems[1].split(',')
                elif tmpItems[0] == 'eventPickAmiTag':
                    # AMI tag
                    eventPickAmiTag = tmpItems[1]
                elif tmpItems[0] == 'eventPickNumSites':
                    # the number of sites where datasets are distributed
                    try:
                        eventPickNumSites = int(tmpItems[1])
                    except Exception:
                        pass
                elif tmpItems[0] == 'userName':
                    # user name
                    self.userDN = tmpItems[1]
                    self.putLog("user=%s" % self.userDN)
                elif tmpItems[0] == 'userTaskName':
                    # user task name
                    self.userTaskName = tmpItems[1]
                elif tmpItems[0] == 'userDatasetName':
                    # user dataset name
                    self.userDatasetName = tmpItems[1]
                elif tmpItems[0] == 'lockedBy':
                    # client name
                    self.lockedBy = tmpItems[1]
                elif tmpItems[0] == 'creationTime':
                    # creation time
                    self.creationTime = tmpItems[1]
                elif tmpItems[0] == 'params':
                    # parameters
                    self.params = tmpItems[1]
                elif tmpItems[0] == 'ei_api':
                    # ei api parameter for MC
                    ei_api = tmpItems[1]
                elif tmpItems[0] == 'inputFileList':
                    # input file list
                    inputFileList = tmpItems[1].split(',')
                    try:
                        inputFileList.remove('')
                    except Exception:
                        pass
                elif tmpItems[0] == 'tagDS':
                    # TAG dataset
                    tagDsList = tmpItems[1].split(',')
                elif tmpItems[0] == 'tagQuery':
                    # query for TAG
                    tagQuery = tmpItems[1]
                elif tmpItems[0] == 'tagStreamRef':
                    # StreamRef for TAG
                    tagStreamRef = tmpItems[1]
                    if not tagStreamRef.endswith('_ref'):
                        tagStreamRef += '_ref'
                elif tmpItems[0] == 'runEvtGuidMap':
                    # GUIDs
                    try:
                        runEvtGuidMap = eval(tmpItems[1])
                    except Exception:
                        pass
            # extract task name
            if self.userTaskName == '' and self.params != '':
                try:
                    tmpMatch = re.search('--outDS(=| ) *([^ ]+)', self.params)
                    if tmpMatch is not None:
                        self.userTaskName = tmpMatch.group(2)
                        if not self.userTaskName.endswith('/'):
                            self.userTaskName += '/'
                except Exception:
                    pass
            # suppress DaTRI
            if self.params != '':
                if '--eventPickSkipDaTRI' in self.params:
                    skipDaTRI = True
            # get compact user name
            compactDN = self.taskBuffer.cleanUserID(self.userDN)
            # get jediTaskID
            self.jediTaskID = self.taskBuffer.getTaskIDwithTaskNameJEDI(
                compactDN, self.userTaskName)
            # get prodSourceLabel
            self.prodSourceLabel, self.job_label = self.taskBuffer.getProdSourceLabelwithTaskID(
                self.jediTaskID)
            # convert run/event list to dataset/file list
            tmpRet, locationMap, allFiles = self.pd2p.convertEvtRunToDatasets(
                runEvtList, eventPickDataType, eventPickStreamName,
                eventPickDS, eventPickAmiTag, self.userDN, runEvtGuidMap,
                ei_api)
            if not tmpRet:
                if 'isFatal' in locationMap and locationMap['isFatal'] is True:
                    self.ignoreError = False
                self.endWithError(
                    'Failed to convert the run/event list to a dataset/file list'
                )
                return False
            # use only files in the list
            if inputFileList != []:
                tmpAllFiles = []
                for tmpFile in allFiles:
                    if tmpFile['lfn'] in inputFileList:
                        tmpAllFiles.append(tmpFile)
                allFiles = tmpAllFiles
            # remove redundant CN from DN
            tmpDN = self.userDN
            tmpDN = re.sub('/CN=limited proxy', '', tmpDN)
            tmpDN = re.sub('(/CN=proxy)+$', '', tmpDN)
            # make dataset container
            tmpRet = self.pd2p.registerDatasetContainerWithDatasets(
                self.userDatasetName,
                allFiles,
                locationMap,
                nSites=eventPickNumSites,
                owner=tmpDN)
            if not tmpRet:
                self.endWithError('Failed to make a dataset container %s' %
                                  self.userDatasetName)
                return False
            # skip DaTRI
            if skipDaTRI:
                # successfully terminated
                self.putLog("skip DaTRI")
                # update task
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID)
            else:
                # get candidates
                tmpRet, candidateMaps = self.pd2p.getCandidates(
                    self.userDatasetName,
                    self.prodSourceLabel,
                    self.job_label,
                    checkUsedFile=False,
                    useHidden=True)
                if not tmpRet:
                    self.endWithError(
                        'Failed to find candidate for destination')
                    return False
                # collect all candidates
                allCandidates = []
                for tmpDS in candidateMaps:
                    tmpDsVal = candidateMaps[tmpDS]
                    for tmpCloud in tmpDsVal:
                        tmpCloudVal = tmpDsVal[tmpCloud]
                        for tmpSiteName in tmpCloudVal[0]:
                            if tmpSiteName not in allCandidates:
                                allCandidates.append(tmpSiteName)
                if allCandidates == []:
                    self.endWithError('No candidate for destination')
                    return False
                # get list of dataset (container) names
                if eventPickNumSites > 1:
                    # decompose container to transfer datasets separately
                    tmpRet, tmpOut = self.pd2p.getListDatasetReplicasInContainer(
                        self.userDatasetName)
                    if not tmpRet:
                        self.endWithError('Failed to get replicas in %s' %
                                          self.userDatasetName)
                        return False
                    userDatasetNameList = list(tmpOut)
                else:
                    # transfer container at once
                    userDatasetNameList = [self.userDatasetName]
                # loop over all datasets
                sitesUsed = []
                for tmpUserDatasetName in userDatasetNameList:
                    # get size of dataset container
                    tmpRet, totalInputSize = rucioAPI.getDatasetSize(
                        tmpUserDatasetName)
                    if not tmpRet:
                        self.endWithError(
                            'Failed to get the size of {0} with {1}'.format(
                                tmpUserDatasetName, totalInputSize))
                        return False
                    # run brokerage
                    tmpJob = JobSpec()
                    tmpJob.AtlasRelease = ''
                    self.putLog("run brokerage for %s" % tmpDS)
                    pandaserver.brokerage.broker.schedule(
                        [tmpJob],
                        self.taskBuffer,
                        self.siteMapper,
                        True,
                        allCandidates,
                        True,
                        datasetSize=totalInputSize)
                    if tmpJob.computingSite.startswith('ERROR'):
                        self.endWithError('brokerage failed with %s' %
                                          tmpJob.computingSite)
                        return False
                    self.putLog("site -> %s" % tmpJob.computingSite)
                    # send transfer request
                    try:
                        tmpDN = rucioAPI.parse_dn(tmpDN)
                        tmpStatus, userInfo = rucioAPI.finger(tmpDN)
                        if not tmpStatus:
                            raise RuntimeError(
                                'user info not found for {0} with {1}'.format(
                                    tmpDN, userInfo))
                        tmpDN = userInfo['nickname']
                        tmpSiteSpec = self.siteMapper.getSite(
                            tmpJob.computingSite)
                        scope_input, scope_output = select_scope(
                            tmpSiteSpec, JobUtils.ANALY_PS, JobUtils.ANALY_PS)
                        tmpDQ2ID = tmpSiteSpec.ddm_input[scope_input]
                        tmpMsg = "%s ds=%s site=%s id=%s" % (
                            'registerDatasetLocation for DaTRI ',
                            tmpUserDatasetName, tmpDQ2ID, tmpDN)
                        self.putLog(tmpMsg)
                        rucioAPI.registerDatasetLocation(
                            tmpUserDatasetName, [tmpDQ2ID],
                            lifetime=14,
                            owner=tmpDN,
                            activity="User Subscriptions")
                        self.putLog('OK')
                    except Exception:
                        errType, errValue = sys.exc_info()[:2]
                        tmpStr = 'Failed to send transfer request : %s %s' % (
                            errType, errValue)
                        tmpStr = tmpStr.strip()
                        tmpStr += traceback.format_exc()
                        self.endWithError(tmpStr)
                        return False
                    # list of sites already used
                    sitesUsed.append(tmpJob.computingSite)
                    self.putLog("used %s sites" % len(sitesUsed))
                    # set candidates
                    if len(sitesUsed) >= eventPickNumSites:
                        # reset candidates to limit the number of sites
                        allCandidates = sitesUsed
                        sitesUsed = []
                    else:
                        # remove site
                        allCandidates.remove(tmpJob.computingSite)
                # send email notification for success
                tmpMsg = 'A transfer request was successfully sent to Rucio.\n'
                tmpMsg += 'Your task will get started once transfer is completed.'
                self.sendEmail(True, tmpMsg)
            try:
                # unlock and delete evp file
                fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
                self.evpFile.close()
                os.remove(self.evpFileName)
            except Exception:
                pass
            # successfully terminated
            self.putLog("end %s" % self.evpFileName)
            return True
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            self.endWithError('Got exception %s:%s %s' %
                              (errType, errValue, traceback.format_exc()))
            return False

    # end with error
    def endWithError(self, message):
        self.putLog(message, 'error')
        # unlock evp file
        try:
            fcntl.flock(self.evpFile.fileno(), fcntl.LOCK_UN)
            self.evpFile.close()
            if not self.ignoreError:
                # remove evp file
                os.remove(self.evpFileName)
                # send email notification
                self.sendEmail(False, message)
        except Exception:
            pass
        # upload log
        if self.jediTaskID is not None:
            outLog = self.uploadLog()
            self.taskBuffer.updateTaskErrorDialogJEDI(
                self.jediTaskID, 'event picking failed. ' + outLog)
            # update task
            if not self.ignoreError:
                self.taskBuffer.updateTaskModTimeJEDI(self.jediTaskID,
                                                      'tobroken')
            self.putLog(outLog)
        self.putLog('end %s' % self.evpFileName)

    # put log
    def putLog(self, msg, type='debug'):
        tmpMsg = msg
        if type == 'error':
            self.logger.error(tmpMsg)
        else:
            self.logger.debug(tmpMsg)

    # send email notification
    def sendEmail(self, isSucceeded, message):
        # mail address
        toAdder = Notifier(self.taskBuffer, None, []).getEmail(self.userDN)
        if toAdder == '':
            self.putLog('cannot find email address for %s' % self.userDN,
                        'error')
            return
        # subject
        mailSubject = "PANDA notification for Event-Picking Request"
        # message
        mailBody = "Hello,\n\nHere is your request status for event picking\n\n"
        if isSucceeded:
            mailBody += "Status  : Passed to Rucio\n"
        else:
            mailBody += "Status  : Failed\n"
        mailBody += "Created : %s\n" % self.creationTime
        mailBody += "Ended   : %s\n" % datetime.datetime.utcnow().strftime(
            '%Y-%m-%d %H:%M:%S')
        mailBody += "Dataset : %s\n" % self.userDatasetName
        mailBody += "\n"
        mailBody += "Parameters : %s %s\n" % (self.lockedBy, self.params)
        mailBody += "\n"
        mailBody += "%s\n" % message
        # send
        retVal = MailUtils().send(toAdder, mailSubject, mailBody)
        # return
        return

    # upload log
    def uploadLog(self):
        if self.jediTaskID is None:
            return 'cannot find jediTaskID'
        strMsg = self.logger.dumpToString()
        s, o = Client.uploadLog(strMsg, self.jediTaskID)
        if s != 0:
            return "failed to upload log with {0}.".format(s)
        if o.startswith('http'):
            return '<a href="{0}">log</a>'.format(o)
        return o
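
A hedged driver sketch for the class above: one EventPicker per event-picking request file, where run() returns True both on success and when the file is already locked by another worker, and False on error. The directory layout and the 'evp.*' file pattern, as well as how taskBuffer and siteMapper are obtained, are assumptions made only for illustration.

import glob
import os

def process_event_picking_requests(taskBuffer, siteMapper, evp_dir, ignore_error=True):
    # the 'evp.*' pattern is an assumed naming convention for request files
    for evpFileName in glob.glob(os.path.join(evp_dir, 'evp.*')):
        picker = EventPicker(taskBuffer, siteMapper, evpFileName, ignore_error)
        if not picker.run():
            # endWithError() has already uploaded the log and, unless errors are
            # ignored, notified the user and marked the task as broken
            continue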
Example 23
 def checkProxy(self, user_dn, production=False, role=None, name=None):
     """Check the validity of a proxy."""
     log_stream = LogWrapper(_logger,
                             '< name="{}" role={} >'.format(name, role))
     log_stream.info('check proxy for {}'.format(user_dn))
     if role is not None:
         tmpExtension = self.getExtension(role)
         proxy_path = os.path.join(
             self.__target_path,
             str(hashlib.sha1(six.b(user_dn + tmpExtension)).hexdigest()))
     elif production:
         proxy_path = os.path.join(
             self.__target_path,
             str(hashlib.sha1(six.b(user_dn + '.prod')).hexdigest()))
     else:
         proxy_path = os.path.join(self.__target_path,
                                   hashlib.sha1(six.b(user_dn)).hexdigest())
     isOK = False
     if os.path.isfile(proxy_path):
         log_stream.info('proxy is there. Need to check validity')
         cmd = "voms-proxy-info -exists -hours 94 -file %s" % proxy_path
         stdout, stderr, status = execute(cmd, log_stream)
         if stdout:
             log_stream.info('stdout is %s ' % stdout)
         if stderr:
             log_stream.info('stderr is %s ' % stderr)
         if status == 1:
             log_stream.info(
                 'proxy expires in 94h or less. We need to renew proxy!')
             ret = self.store(user_dn,
                              self.__cred_name,
                              production,
                              role=role,
                              log_stream=log_stream)
             if ret == 0:
                 log_stream.info('proxy retrieval successful')
                 isOK = True
             elif ret == 2:
                 log_stream.info('proxy retrieval on hold')
             else:
                 log_stream.error('proxy retrieval failed')
         else:
             log_stream.info('proxy is valid for more than 3 days')
             isOK = True
     else:
         log_stream.info(
             'proxy is not in the cache repo. will try to get it from myproxy'
         )
         ret = self.store(user_dn,
                          self.__cred_name,
                          production,
                          role=role,
                          log_stream=log_stream)
         if ret == 0:
             log_stream.info('proxy stored successfully')
             isOK = True
         elif ret == 2:
             log_stream.info('proxy retrieval on hold')
         else:
             log_stream.error('proxy retrieval failed')
     if isOK:
         plain_path = os.path.join(
             self.__target_path,
             hashlib.sha1(six.b(user_dn + '.plain')).hexdigest())
         if os.path.isfile(plain_path):
             return self.checkValidity(plain_path, log_stream)
         else:
             log_stream.error('plain proxy not there at the moment!')
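
A minimal usage sketch, assuming checkProxy() belongs to a proxy-cache object (proxy_cache below) that was constructed elsewhere with the target path and credential name. Only the success path returns a value, via checkValidity(); every failure branch falls through and yields None.

def ensure_user_proxy(proxy_cache, user_dn, role=None, name=None):
    # proxy_cache supplies checkProxy(); it is passed in because its construction
    # (target path, credential name) happens elsewhere in the server
    validity = proxy_cache.checkProxy(user_dn, role=role, name=name)
    if validity is None:
        # retrieval failed, retrieval is on hold, or the plain proxy file is missing
        return False
    return True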
Example 24
class AdderGen(object):
    # constructor
    def __init__(self,
                 taskBuffer,
                 jobID,
                 jobStatus,
                 attemptNr,
                 ignoreTmpError=True,
                 siteMapper=None,
                 pid=None,
                 prelock_pid=None,
                 lock_offset=10):
        self.job = None
        self.jobID = jobID
        self.jobStatus = jobStatus
        self.taskBuffer = taskBuffer
        self.ignoreTmpError = ignoreTmpError
        self.lock_offset = lock_offset
        self.siteMapper = siteMapper
        self.datasetMap = {}
        self.extraInfo = {
            'surl': {},
            'nevents': {},
            'lbnr': {},
            'endpoint': {},
            'guid': {}
        }
        self.attemptNr = attemptNr
        self.pid = pid
        self.prelock_pid = prelock_pid
        self.data = None
        # logger
        self.logger = LogWrapper(_logger, str(self.jobID))

    # dump file report
    def dumpFileReport(self, fileCatalog, attemptNr):
        self.logger.debug("dump file report")
        # dump Catalog into file
        # if attemptNr is None:
        #     xmlFile = '%s/%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
        #                                str(uuid.uuid4()))
        # else:
        #     xmlFile = '%s/%s_%s_%s_%s' % (panda_config.logdir,self.jobID,self.jobStatus,
        #                                   str(uuid.uuid4()),attemptNr)
        # file = open(xmlFile,'w')
        # file.write(fileCatalog)
        # file.close()
        # dump Catalog into job output report table
        attempt_nr = 0 if attemptNr is None else attemptNr
        if self.job is None:
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        if self.job:
            self.taskBuffer.insertJobOutputReport(
                panda_id=self.jobID,
                prod_source_label=self.job.prodSourceLabel,
                job_status=self.jobStatus,
                attempt_nr=attempt_nr,
                data=fileCatalog)

    # get plugin class
    def getPluginClass(self, tmpVO, tmpGroup):
        # instantiate concrete plugin
        adderPluginClass = panda_config.getPlugin('adder_plugins', tmpVO,
                                                  tmpGroup)
        if adderPluginClass is None:
            # use ATLAS plugin by default
            from pandaserver.dataservice.AdderAtlasPlugin import AdderAtlasPlugin
            adderPluginClass = AdderAtlasPlugin
        self.logger.debug('plugin name {0}'.format(adderPluginClass.__name__))
        return adderPluginClass
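
    # Hedged usage note: AdderGen is typically driven by an outer loop that pulls
    # pending job output reports from the task buffer and runs one instance per
    # (PandaID, attemptNr), roughly
    #   adder = AdderGen(taskBuffer, pandaID, jobStatus, attemptNr,
    #                    siteMapper=siteMapper, pid=os.getpid())
    #   adder.run()
    # The exact driver is an assumption for illustration; run() below merges the
    # pilot's file report into the job record and triggers Closer for the outputs.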

    # main
    def run(self):
        try:
            self.logger.debug("new start: %s attemptNr=%s" %
                              (self.jobStatus, self.attemptNr))

            # got lock, get the report
            report_dict = self.taskBuffer.getJobOutputReport(
                panda_id=self.jobID, attempt_nr=self.attemptNr)
            self.data = report_dict.get('data')

            # query job
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
            # check if job has finished
            if self.job is None:
                self.logger.debug(': job not found in DB')
            elif self.job.jobStatus in [
                    'finished', 'failed', 'unknown', 'merging'
            ]:
                self.logger.error(': invalid state -> %s' % self.job.jobStatus)
            elif self.attemptNr is not None and self.job.attemptNr != self.attemptNr:
                self.logger.error('wrong attemptNr -> job=%s <> %s' %
                                  (self.job.attemptNr, self.attemptNr))
            # elif self.attemptNr is not None and self.job.jobStatus == 'transferring':
            #     errMsg = 'XML with attemptNr for {0}'.format(self.job.jobStatus)
            #     self.logger.error(errMsg)
            elif self.jobStatus == EventServiceUtils.esRegStatus:
                # instantiate concrete plugin
                adderPluginClass = self.getPluginClass(self.job.VO,
                                                       self.job.cloud)
                adderPlugin = adderPluginClass(self.job,
                                               taskBuffer=self.taskBuffer,
                                               siteMapper=self.siteMapper,
                                               logger=self.logger)
                # execute
                self.logger.debug('plugin is ready for ES file registration')
                adderPlugin.registerEventServiceFiles()
            else:
                # check file status in JEDI
                if not self.job.isCancelled() and self.job.taskBufferErrorCode not in \
                                                      [pandaserver.taskbuffer.ErrorCode.EC_PilotRetried]:
                    fileCheckInJEDI = self.taskBuffer.checkInputFileStatusInJEDI(
                        self.job)
                    self.logger.debug("check file status in JEDI : {0}".format(
                        fileCheckInJEDI))
                    if fileCheckInJEDI is None:
                        raise RuntimeError(
                            'failed to check file status in JEDI')
                    if fileCheckInJEDI is False:
                        # set job status to failed since some file status is wrong in JEDI
                        self.jobStatus = 'failed'
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        errStr = "inconsistent file status between Panda and JEDI. "
                        errStr += "failed to avoid duplicated processing caused by synchronization failure"
                        self.job.ddmErrorDiag = errStr
                        self.logger.debug(
                            "set jobStatus={0} since input is inconsistent between Panda and JEDI"
                            .format(self.jobStatus))
                    elif self.job.jobSubStatus in ['pilot_closed']:
                        # terminated by the pilot
                        self.logger.debug(
                            "going to closed since terminated by the pilot")
                        retClosed = self.taskBuffer.killJobs([self.jobID],
                                                             'pilot', '60',
                                                             True)
                        if retClosed[0] is True:
                            self.logger.debug("end")
                            # remove Catalog
                            self.taskBuffer.deleteJobOutputReport(
                                panda_id=self.jobID, attempt_nr=self.attemptNr)
                            return
                    # check for cloned jobs
                    if EventServiceUtils.isJobCloningJob(self.job):
                        checkJC = self.taskBuffer.checkClonedJob(self.job)
                        if checkJC is None:
                            raise RuntimeError(
                                'failed to check the cloned job')
                        # failed to lock semaphore
                        if checkJC['lock'] is False:
                            self.jobStatus = 'failed'
                            self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                            self.job.ddmErrorDiag = "failed to lock semaphore for job cloning"
                            self.logger.debug(
                                "set jobStatus={0} since did not get semaphore for job cloning"
                                .format(self.jobStatus))
                # use failed for cancelled/closed jobs
                if self.job.isCancelled():
                    self.jobStatus = 'failed'
                    # reset error codes to skip retrial module
                    self.job.pilotErrorCode = 0
                    self.job.exeErrorCode = 0
                    self.job.ddmErrorCode = 0
                # keep old status
                oldJobStatus = self.job.jobStatus
                # set job status
                if self.job.jobStatus not in ['transferring']:
                    self.job.jobStatus = self.jobStatus
                addResult = None
                adderPlugin = None
                # parse XML
                parseResult = self.parseXML()
                if parseResult < 2:
                    # interaction with DDM
                    try:
                        # instantiate concrete plugin
                        adderPluginClass = self.getPluginClass(
                            self.job.VO, self.job.cloud)
                        adderPlugin = adderPluginClass(
                            self.job,
                            taskBuffer=self.taskBuffer,
                            siteMapper=self.siteMapper,
                            extraInfo=self.extraInfo,
                            logger=self.logger)
                        # execute
                        self.logger.debug('plugin is ready')
                        adderPlugin.execute()
                        addResult = adderPlugin.result
                        self.logger.debug('plugin done with %s' %
                                          (addResult.statusCode))
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}:{2}"
                            .format(self.job.VO, errtype, errvalue))
                        self.logger.error(
                            "failed to execute AdderPlugin for VO={0} with {1}"
                            .format(self.job.VO, traceback.format_exc()))
                        addResult = None
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "AdderPlugin failure"

                    # ignore temporary errors
                    if self.ignoreTmpError and addResult is not None and addResult.isTemporary(
                    ):
                        self.logger.debug(': ignore %s ' %
                                          self.job.ddmErrorDiag)
                        self.logger.debug('escape')
                        # unlock job output report
                        self.taskBuffer.unlockJobOutputReport(
                            panda_id=self.jobID,
                            attempt_nr=self.attemptNr,
                            pid=self.pid,
                            lock_offset=self.lock_offset)
                        return
                    # failed
                    if addResult is None or not addResult.isSucceeded():
                        self.job.jobStatus = 'failed'
                # set file status for failed jobs or failed transferring jobs
                self.logger.debug(
                    "status after plugin call :job.jobStatus=%s jobStatus=%s" %
                    (self.job.jobStatus, self.jobStatus))
                if self.job.jobStatus == 'failed' or self.jobStatus == 'failed':
                    # First of all: check if job failed and in this case take first actions according to error table
                    source, error_code, error_diag = None, None, None
                    errors = []
                    if self.job.pilotErrorCode:
                        source = 'pilotErrorCode'
                        error_code = self.job.pilotErrorCode
                        error_diag = self.job.pilotErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.exeErrorCode:
                        source = 'exeErrorCode'
                        error_code = self.job.exeErrorCode
                        error_diag = self.job.exeErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.ddmErrorCode:
                        source = 'ddmErrorCode'
                        error_code = self.job.ddmErrorCode
                        error_diag = self.job.ddmErrorDiag
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })
                    if self.job.transExitCode:
                        source = 'transExitCode'
                        error_code = self.job.transExitCode
                        error_diag = ''
                        errors.append({
                            'source': source,
                            'error_code': error_code,
                            'error_diag': error_diag
                        })

                    # _logger.info("updatejob has source %s, error_code %s and error_diag %s"%(source, error_code, error_diag))

                    if source and error_code:
                        try:
                            self.logger.debug(
                                "AdderGen.run will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, self.job.PandaID, errors,
                                self.job.attemptNr)
                            self.logger.debug("apply_retrial_rules is back")
                        except Exception as e:
                            self.logger.error(
                                "apply_retrial_rules excepted and needs to be investigated (%s): %s"
                                % (e, traceback.format_exc()))

                    self.job.jobStatus = 'failed'
                    for file in self.job.Files:
                        if file.type in ['output', 'log']:
                            if addResult is not None and file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                            else:
                                file.status = 'failed'
                else:
                    # reset errors
                    self.job.jobDispatcherErrorCode = 0
                    self.job.jobDispatcherErrorDiag = 'NULL'
                    # set status
                    if addResult is not None and addResult.mergingFiles != []:
                        # set status for merging:
                        for file in self.job.Files:
                            if file.lfn in addResult.mergingFiles:
                                file.status = 'merging'
                        self.job.jobStatus = 'merging'
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    elif addResult is not None and addResult.transferringFiles != []:
                        # set status for transferring
                        for file in self.job.Files:
                            if file.lfn in addResult.transferringFiles:
                                file.status = 'transferring'
                        self.job.jobStatus = 'transferring'
                        self.job.jobSubStatus = None
                        # propagate transition to prodDB
                        self.job.stateChangeTime = time.strftime(
                            '%Y-%m-%d %H:%M:%S', time.gmtime())
                    else:
                        self.job.jobStatus = 'finished'
                # endtime
                if self.job.endTime == 'NULL':
                    self.job.endTime = time.strftime('%Y-%m-%d %H:%M:%S',
                                                     time.gmtime())
                # output size and # of outputs
                self.job.nOutputDataFiles = 0
                self.job.outputFileBytes = 0
                for tmpFile in self.job.Files:
                    if tmpFile.type == 'output':
                        self.job.nOutputDataFiles += 1
                        try:
                            self.job.outputFileBytes += tmpFile.fsize
                        except Exception:
                            pass
                # protection
                maxOutputFileBytes = 99999999999
                if self.job.outputFileBytes > maxOutputFileBytes:
                    self.job.outputFileBytes = maxOutputFileBytes
                # set cancelled state
                if self.job.commandToPilot == 'tobekilled' and self.job.jobStatus == 'failed':
                    self.job.jobStatus = 'cancelled'
                # update job
                if oldJobStatus in ['cancelled', 'closed']:
                    pass
                else:
                    self.logger.debug("updating DB")
                    retU = self.taskBuffer.updateJobs(
                        [self.job],
                        False,
                        oldJobStatusList=[oldJobStatus],
                        extraInfo=self.extraInfo)
                    self.logger.debug("retU: %s" % retU)
                    # failed
                    if not retU[0]:
                        self.logger.error(
                            'failed to update DB for pandaid={0}'.format(
                                self.job.PandaID))
                        # unlock job output report
                        self.taskBuffer.unlockJobOutputReport(
                            panda_id=self.jobID,
                            attempt_nr=self.attemptNr,
                            pid=self.pid,
                            lock_offset=self.lock_offset)
                        return

                    try:
                        # updateJobs was successful and it failed a job with taskBufferErrorCode
                        self.logger.debug("AdderGen.run will peek the job")
                        job_tmp = self.taskBuffer.peekJobs(
                            [self.job.PandaID],
                            fromDefined=False,
                            fromArchived=True,
                            fromWaiting=False)[0]
                        self.logger.debug(
                            "status {0}, taskBufferErrorCode {1}, taskBufferErrorDiag {2}"
                            .format(job_tmp.jobStatus,
                                    job_tmp.taskBufferErrorCode,
                                    job_tmp.taskBufferErrorDiag))
                        if job_tmp.jobStatus == 'failed' and job_tmp.taskBufferErrorCode:
                            source = 'taskBufferErrorCode'
                            error_code = job_tmp.taskBufferErrorCode
                            error_diag = job_tmp.taskBufferErrorDiag
                            errors = [{
                                'source': source,
                                'error_code': error_code,
                                'error_diag': error_diag
                            }]
                            self.logger.debug(
                                "AdderGen.run 2 will call apply_retrial_rules")
                            retryModule.apply_retrial_rules(
                                self.taskBuffer, job_tmp.PandaID, errors,
                                job_tmp.attemptNr)
                            self.logger.debug("apply_retrial_rules 2 is back")
                    except IndexError:
                        pass
                    except Exception as e:
                        self.logger.error(
                            "apply_retrial_rules 2 excepted and needs to be investigated (%s): %s"
                            % (e, traceback.format_exc()))

                    # setup for closer
                    if not (EventServiceUtils.isEventServiceJob(self.job)
                            and self.job.isCancelled()):
                        destDBList = []
                        guidList = []
                        for file in self.job.Files:
                            # ignore inputs
                            if file.type == 'input':
                                continue
                            # skip pseudo datasets
                            if file.destinationDBlock in ['', None, 'NULL']:
                                continue
                            # start closer for output/log datasets
                            if file.destinationDBlock not in destDBList:
                                destDBList.append(file.destinationDBlock)
                            # collect GUIDs
                            if (self.job.prodSourceLabel=='panda' or (self.job.prodSourceLabel in ['rucio_test'] + JobUtils.list_ptest_prod_sources and \
                                                                      self.job.processingType in ['pathena','prun','gangarobot-rctest','hammercloud'])) \
                                                                      and file.type == 'output':
                                # extract base LFN since LFN was changed to full LFN for CMS
                                baseLFN = file.lfn.split('/')[-1]
                                guidList.append({
                                    'lfn': baseLFN,
                                    'guid': file.GUID,
                                    'type': file.type,
                                    'checksum': file.checksum,
                                    'md5sum': file.md5sum,
                                    'fsize': file.fsize,
                                    'scope': file.scope
                                })
                        if guidList != []:
                            retG = self.taskBuffer.setGUIDs(guidList)
                        if destDBList != []:
                            # start Closer
                            if adderPlugin is not None and hasattr(
                                    adderPlugin, 'datasetMap'
                            ) and adderPlugin.datasetMap != {}:
                                cThr = Closer.Closer(
                                    self.taskBuffer,
                                    destDBList,
                                    self.job,
                                    datasetMap=adderPlugin.datasetMap)
                            else:
                                cThr = Closer.Closer(self.taskBuffer,
                                                     destDBList, self.job)
                            self.logger.debug("start Closer")
                            # cThr.start()
                            # cThr.join()
                            cThr.run()
                            del cThr
                            self.logger.debug("end Closer")
                        # run closer for associated parallel jobs
                        if EventServiceUtils.isJobCloningJob(self.job):
                            assDBlockMap = self.taskBuffer.getDestDBlocksWithSingleConsumer(
                                self.job.jediTaskID, self.job.PandaID,
                                destDBList)
                            for assJobID in assDBlockMap:
                                assDBlocks = assDBlockMap[assJobID]
                                assJob = self.taskBuffer.peekJobs(
                                    [assJobID],
                                    fromDefined=False,
                                    fromArchived=False,
                                    fromWaiting=False,
                                    forAnal=True)[0]
                                if assJob is None:
                                    self.logger.debug(
                                        ': associated job PandaID={0} not found in DB'
                                        .format(assJobID))
                                else:
                                    cThr = Closer.Closer(
                                        self.taskBuffer, assDBlocks, assJob)
                                    self.logger.debug(
                                        "start Closer for PandaID={0}".format(
                                            assJobID))
                                    # cThr.start()
                                    # cThr.join()
                                    cThr.run()
                                    del cThr
                                    self.logger.debug(
                                        "end Closer for PandaID={0}".format(
                                            assJobID))
            self.logger.debug("end")
            # try:
            #     # remove Catalog
            #     os.remove(self.xmlFile)
            # except Exception:
            #     pass
            # remove Catalog
            self.taskBuffer.deleteJobOutputReport(panda_id=self.jobID,
                                                  attempt_nr=self.attemptNr)
            del self.data
            del report_dict
        except Exception as e:
            errStr = ": {} {}".format(str(e), traceback.format_exc())
            self.logger.error(errStr)
            self.logger.error("except")
            # unlock job output report
            self.taskBuffer.unlockJobOutputReport(panda_id=self.jobID,
                                                  attempt_nr=self.attemptNr,
                                                  pid=self.pid,
                                                  lock_offset=self.lock_offset)

    # parse XML
    # 0: succeeded, 1: harmless error to exit, 2: fatal error, 3: event service
    def parseXML(self):
        # get LFN and GUID
        # self.logger.debug('XML filename : %s' % self.xmlFile)
        # no outputs
        log_out = [f for f in self.job.Files if f.type in ['log', 'output']]
        if not log_out:
            self.logger.debug("has no outputs")
            self.logger.debug("parseXML end")
            return 0
        # get input files
        inputLFNs = []
        for file in self.job.Files:
            if file.type == 'input':
                inputLFNs.append(file.lfn)
        # parse XML
        lfns = []
        guids = []
        fsizes = []
        md5sums = []
        chksums = []
        surls = []
        fullLfnMap = {}
        nEventsMap = {}
        guidMap = dict()
        try:
            # root  = xml.dom.minidom.parse(self.xmlFile)
            root = xml.dom.minidom.parseString(self.data)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                # get metadata
                fsize = None
                md5sum = None
                adler32 = None
                surl = None
                fullLFN = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'fsize':
                        fsize = int(meta.getAttribute('att_value'))
                    elif name == 'md5sum':
                        md5sum = str(meta.getAttribute('att_value'))
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                            md5sum = None
                    elif name == 'adler32':
                        adler32 = str(meta.getAttribute('att_value'))
                    elif name == 'surl':
                        surl = str(meta.getAttribute('att_value'))
                    elif name == 'full_lfn':
                        fullLFN = str(meta.getAttribute('att_value'))
                # endpoints
                self.extraInfo['endpoint'][lfn] = []
                for epNode in file.getElementsByTagName('endpoint'):
                    self.extraInfo['endpoint'][lfn].append(
                        str(epNode.firstChild.data))
                # error check
                if (lfn not in inputLFNs) and (fsize is None or
                                               (md5sum is None
                                                and adler32 is None)):
                    if EventServiceUtils.isEventServiceMerge(self.job):
                        continue
                    else:
                        raise RuntimeError('fsize/md5sum/adler32/surl=None')
                # append
                lfns.append(lfn)
                guids.append(guid)
                fsizes.append(fsize)
                md5sums.append(md5sum)
                surls.append(surl)
                if adler32 is not None:
                    # use adler32 if available
                    chksums.append("ad:%s" % adler32)
                else:
                    chksums.append("md5:%s" % md5sum)
                if fullLFN is not None:
                    fullLfnMap[lfn] = fullLFN
        except Exception:
            # parse json
            try:
                import json
                # with open(self.xmlFile) as tmpF:
                jsonDict = json.loads(self.data)
                for lfn in jsonDict:
                    fileData = jsonDict[lfn]
                    lfn = str(lfn)
                    fsize = None
                    md5sum = None
                    adler32 = None
                    surl = None
                    fullLFN = None
                    guid = str(fileData['guid'])
                    if 'fsize' in fileData:
                        fsize = int(fileData['fsize'])
                    if 'md5sum' in fileData:
                        md5sum = str(fileData['md5sum'])
                        # check
                        if re.search("^[a-fA-F0-9]{32}$", md5sum) is None:
                            md5sum = None
                    if 'adler32' in fileData:
                        adler32 = str(fileData['adler32'])
                    if 'surl' in fileData:
                        surl = str(fileData['surl'])
                    if 'full_lfn' in fileData:
                        fullLFN = str(fileData['full_lfn'])
                    # endpoints
                    self.extraInfo['endpoint'][lfn] = []
                    if 'endpoint' in fileData:
                        self.extraInfo['endpoint'][lfn] = fileData['endpoint']
                    # error check
                    if (lfn not in inputLFNs) and (fsize is None or
                                                   (md5sum is None
                                                    and adler32 is None)):
                        if EventServiceUtils.isEventServiceMerge(self.job):
                            continue
                        else:
                            raise RuntimeError(
                                'fsize/md5sum/adler32/surl=None')
                    # append
                    lfns.append(lfn)
                    guids.append(guid)
                    fsizes.append(fsize)
                    md5sums.append(md5sum)
                    surls.append(surl)
                    if adler32 is not None:
                        # use adler32 if available
                        chksums.append("ad:%s" % adler32)
                    else:
                        chksums.append("md5:%s" % md5sum)
                    if fullLFN is not None:
                        fullLfnMap[lfn] = fullLFN
            except Exception:
                # check if file exists
                # if os.path.exists(self.xmlFile):
                if True:
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                    # set failed anyway
                    self.job.jobStatus = 'failed'
                    # XML error happens when pilot got killed due to wall-time limit or failures in wrapper
                    if (self.job.pilotErrorCode in [0,'0','NULL']) and \
                       (self.job.taskBufferErrorCode not in [pandaserver.taskbuffer.ErrorCode.EC_WorkerDone]) and \
                       (self.job.transExitCode  in [0,'0','NULL']):
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "Could not get GUID/LFN/MD5/FSIZE/SURL from pilot XML"
                    return 2
                else:
                    # XML was deleted
                    return 1
        # parse metadata to get nEvents
        nEventsFrom = None
        try:
            root = xml.dom.minidom.parseString(self.job.metadata)
            files = root.getElementsByTagName('File')
            for file in files:
                # get GUID
                guid = str(file.getAttribute('ID'))
                # get PFN and LFN nodes
                logical = file.getElementsByTagName('logical')[0]
                lfnNode = logical.getElementsByTagName('lfn')[0]
                # convert UTF8 to Raw
                lfn = str(lfnNode.getAttribute('name'))
                guidMap[lfn] = guid
                # get metadata
                nevents = None
                for meta in file.getElementsByTagName('metadata'):
                    # get fsize
                    name = str(meta.getAttribute('att_name'))
                    if name == 'events':
                        nevents = int(meta.getAttribute('att_value'))
                        nEventsMap[lfn] = nevents
                        break
            nEventsFrom = "xml"
        except Exception:
            pass
        # parse json
        try:
            import json
            jsonDict = json.loads(self.job.metadata)
            for jsonFileItem in jsonDict['files']['output']:
                for jsonSubFileItem in jsonFileItem['subFiles']:
                    lfn = str(jsonSubFileItem['name'])
                    try:
                        nevents = int(jsonSubFileItem['nentries'])
                        nEventsMap[lfn] = nevents
                    except Exception:
                        pass
                    try:
                        guid = str(jsonSubFileItem['file_guid'])
                        guidMap[lfn] = guid
                    except Exception:
                        pass
            nEventsFrom = "json"
        except Exception:
            pass
        # use nEvents and GUIDs reported by the pilot if no job report
        if self.job.metadata == 'NULL' and self.jobStatus == 'finished' and self.job.nEvents > 0 \
                and self.job.prodSourceLabel in ['managed']:
            for file in self.job.Files:
                if file.type == 'output':
                    nEventsMap[file.lfn] = self.job.nEvents
            for lfn, guid in zip(lfns, guids):
                guidMap[lfn] = guid
            nEventsFrom = "pilot"
        self.logger.debug('nEventsMap=%s' % str(nEventsMap))
        self.logger.debug('nEventsFrom=%s' % str(nEventsFrom))
        self.logger.debug('guidMap=%s' % str(guidMap))
        self.logger.debug('self.job.jobStatus=%s in parseXML' %
                          self.job.jobStatus)
        self.logger.debug(
            'isES=%s isJumbo=%s' % (EventServiceUtils.isEventServiceJob(
                self.job), EventServiceUtils.isJumboJob(self.job)))
        # get lumi block number
        lumiBlockNr = self.job.getLumiBlockNr()
        # copy files for variable number of outputs
        tmpStat = self.copyFilesForVariableNumOutputs(lfns)
        if not tmpStat:
            self.logger.error(
                "failed to copy files for variable number of outputs")
            return 2
        # check files
        fileList = []
        for file in self.job.Files:
            fileList.append(file.lfn)
            if file.type == 'input':
                if file.lfn in lfns:
                    if self.job.prodSourceLabel in ['user', 'panda']:
                        # skipped file
                        file.status = 'skipped'
                    elif self.job.prodSourceLabel in [
                            'managed', 'test'
                    ] + JobUtils.list_ptest_prod_sources:
                        # failed by pilot
                        file.status = 'failed'
            elif file.type == 'output' or file.type == 'log':
                # add only log file for failed jobs
                if self.jobStatus == 'failed' and file.type != 'log':
                    file.status = 'failed'
                    continue
                # set failed if it is missing in XML
                if file.lfn not in lfns:
                    if (self.job.jobStatus == 'finished' and EventServiceUtils.isEventServiceJob(self.job)) \
                            or EventServiceUtils.isJumboJob(self.job):
                        # unset file status for ES jobs
                        pass
                    elif file.isAllowedNoOutput():
                        # allowed not to be produced
                        file.status = 'nooutput'
                        self.logger.debug('set {0} to status={1}'.format(
                            file.lfn, file.status))
                    else:
                        file.status = 'failed'
                        self.job.jobStatus = 'failed'
                        self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                        self.job.ddmErrorDiag = "expected output {0} is missing in pilot XML".format(
                            file.lfn)
                        self.logger.error(self.job.ddmErrorDiag)
                    continue
                # look for GUID with LFN
                try:
                    i = lfns.index(file.lfn)
                    file.GUID = guids[i]
                    file.fsize = fsizes[i]
                    file.md5sum = md5sums[i]
                    file.checksum = chksums[i]
                    surl = surls[i]
                    # status
                    file.status = 'ready'
                    # change to full LFN
                    if file.lfn in fullLfnMap:
                        file.lfn = fullLfnMap[file.lfn]
                    # add SURL to extraInfo
                    self.extraInfo['surl'][file.lfn] = surl
                    # add nevents
                    if file.lfn in nEventsMap:
                        self.extraInfo['nevents'][file.lfn] = nEventsMap[
                            file.lfn]
                except Exception:
                    # status
                    file.status = 'failed'
                    type, value, traceBack = sys.exc_info()
                    self.logger.error(": %s %s" % (type, value))
                # set lumi block number
                if lumiBlockNr is not None and file.status != 'failed':
                    self.extraInfo['lbnr'][file.lfn] = lumiBlockNr
        self.extraInfo['guid'] = guidMap
        # check consistency between XML and filesTable
        for lfn in lfns:
            if lfn not in fileList:
                self.logger.error("%s is not found in filesTable" % lfn)
                self.job.jobStatus = 'failed'
                for tmpFile in self.job.Files:
                    tmpFile.status = 'failed'
                self.job.ddmErrorCode = pandaserver.dataservice.ErrorCode.EC_Adder
                self.job.ddmErrorDiag = "pilot produced {0} inconsistently with jobdef".format(
                    lfn)
                return 2
        # return
        self.logger.debug("parseXML end")
        return 0

    # copy files for variable number of outputs
    def copyFilesForVariableNumOutputs(self, lfns):
        # get original output files
        origOutputs = {}
        updateOrig = {}
        for tmpFile in self.job.Files:
            if tmpFile.type in ['output', 'log']:
                origOutputs[tmpFile.lfn] = tmpFile
                if tmpFile.lfn in lfns:
                    # keep original
                    updateOrig[tmpFile.lfn] = False
                else:
                    # overwrite original
                    updateOrig[tmpFile.lfn] = True
        # look for unknown files
        addedNewFiles = False
        for newLFN in lfns:
            if newLFN not in origOutputs:
                # look for corresponding original output
                for origLFN in origOutputs:
                    tmpPatt = r'^{0}\.*_\d+$'.format(origLFN)
                    if re.search(tmpPatt, newLFN) is not None:
                        # copy file record
                        tmpStat = self.taskBuffer.copyFileRecord(
                            newLFN, origOutputs[origLFN], updateOrig[origLFN])
                        if not tmpStat:
                            return False
                        addedNewFiles = True
                        # disable further overwriting
                        updateOrig[origLFN] = False
                        break
        # refresh job info
        if addedNewFiles:
            self.job = self.taskBuffer.peekJobs([self.jobID],
                                                fromDefined=False,
                                                fromWaiting=False,
                                                forAnal=True)[0]
        # return
        return True
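
The JSON branch of parseXML above boils down to walking files -> output -> subFiles in the pilot job report and collecting 'nentries' and 'file_guid' per LFN. Below is a minimal standalone sketch of that walk; the report structure follows the field names used in the code, while the LFN, GUID and event count are invented placeholders.

import json

# invented pilot job report; only the structure matters here
sample_report = json.dumps({
    'files': {'output': [
        {'subFiles': [
            {'name': 'EVNT.01234._000001.pool.root.1',
             'nentries': 5000,
             'file_guid': 'A1B2C3D4-0000-1111-2222-333344445555'},
        ]},
    ]},
})

nEventsMap = {}
guidMap = {}
for fileItem in json.loads(sample_report)['files']['output']:
    for subFile in fileItem['subFiles']:
        lfn = str(subFile['name'])
        if 'nentries' in subFile:
            nEventsMap[lfn] = int(subFile['nentries'])
        if 'file_guid' in subFile:
            guidMap[lfn] = str(subFile['file_guid'])

print(nEventsMap)  # {'EVNT.01234._000001.pool.root.1': 5000}
print(guidMap)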
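
copyFilesForVariableNumOutputs maps each LFN reported by the pilot but not defined in the job back to the original output it was derived from, by matching against the original LFN plus a serial-number suffix. A hedged sketch of that matching with invented LFNs, reusing the same (unescaped) pattern as the method above:

import re

orig_lfns = ['user.alice.task.AOD.pool.root', 'user.alice.task.log.tgz']
pilot_lfns = ['user.alice.task.AOD.pool.root._000001',
              'user.alice.task.AOD.pool.root._000002',
              'user.alice.task.log.tgz']

for new_lfn in pilot_lfns:
    if new_lfn in orig_lfns:
        continue  # expected output, nothing to map
    for orig_lfn in orig_lfns:
        # original LFN, optional dots, then an underscore and a serial number
        if re.search(r'^{0}\.*_\d+$'.format(orig_lfn), new_lfn):
            print('{0} -> derived from {1}'.format(new_lfn, orig_lfn))
            break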
Example 25
0
 def __init__(self, job, datasets, log):
     self.jobSpec = job
     self.datasets = datasets
     self.tmpLog = LogWrapper(
         log, "{0} CloserAtlasPlugin".format(self.jobSpec.PandaID))
Example 26
0
import re
import sys
import datetime
import traceback
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper
from pandaserver.brokerage.SiteMapper import SiteMapper

# password
from pandaserver.config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
Example 27
0
 def run(self):
     try:
         # make a message instance
         tmpLog = LogWrapper(_logger, None)
         # run main procedure in the same process
         if not self.forkRun:
             tmpLog.debug('main start')
             tmpLog.debug('firstSubmission={0}'.format(
                 self.firstSubmission))
             # group jobs per VO
             voJobsMap = {}
             ddmFreeJobs = []
             tmpLog.debug('{0} jobs in total'.format(len(self.jobs)))
             for tmpJob in self.jobs:
                 # set VO=local for DDM free
                 if tmpJob.destinationSE == 'local':
                     tmpVO = 'local'
                 else:
                     tmpVO = tmpJob.VO
                 # make map
                 voJobsMap.setdefault(tmpVO, [])
                 voJobsMap[tmpVO].append(tmpJob)
             # loop over all VOs
             for tmpVO in voJobsMap:
                 tmpJobList = voJobsMap[tmpVO]
                 tmpLog.debug('vo={0} has {1} jobs'.format(
                     tmpVO, len(tmpJobList)))
                 # get plugin
                 setupperPluginClass = panda_config.getPlugin(
                     'setupper_plugins', tmpVO)
                 if setupperPluginClass is None:
                     # use ATLAS plug-in by default
                     from pandaserver.dataservice.SetupperAtlasPlugin import SetupperAtlasPlugin
                     setupperPluginClass = SetupperAtlasPlugin
                 tmpLog.debug('plugin name -> {0}'.format(
                     setupperPluginClass.__name__))
                 try:
                     # make plugin
                     setupperPlugin = setupperPluginClass(
                         self.taskBuffer,
                         self.jobs,
                         tmpLog,
                         resubmit=self.resubmit,
                         pandaDDM=self.pandaDDM,
                         ddmAttempt=self.ddmAttempt,
                         onlyTA=self.onlyTA,
                         firstSubmission=self.firstSubmission)
                     # run plugin
                     tmpLog.debug('run plugin')
                     setupperPlugin.run()
                     # go forward if not TA
                     if not self.onlyTA:
                         # update jobs
                         tmpLog.debug('update jobs')
                         self.updateJobs(
                             setupperPlugin.jobs + setupperPlugin.jumboJobs,
                             tmpLog)
                         # execute post process
                         tmpLog.debug('post execute plugin')
                         setupperPlugin.postRun()
                     tmpLog.debug('done plugin')
                 except Exception:
                     errtype, errvalue = sys.exc_info()[:2]
                     tmpLog.error('plugin failed with {0}:{1}'.format(
                         errtype, errvalue))
             tmpLog.debug('main end')
         else:
             tmpLog.debug('fork start')
             # write jobs to file
             import os
             try:
                 import cPickle as pickle
             except ImportError:
                 import pickle
             outFileName = '%s/set.%s_%s' % (panda_config.logdir,
                                             self.jobs[0].PandaID,
                                             str(uuid.uuid4()))
             outFile = open(outFileName, 'wb')
             pickle.dump(self.jobs, outFile, protocol=0)
             outFile.close()
             # run main procedure in another process because python doesn't release memory
             com = 'cd %s > /dev/null 2>&1; export HOME=%s; ' % (
                 panda_config.home_dir_cwd, panda_config.home_dir_cwd)
             com += 'env PYTHONPATH=%s:%s %s/python -Wignore %s/dataservice/forkSetupper.py -i %s' % \
                    (panda_config.pandaCommon_dir,panda_config.pandaPython_dir,panda_config.native_python,
                     panda_config.pandaPython_dir,outFileName)
             if self.onlyTA:
                 com += " -t"
             if not self.firstSubmission:
                 com += " -f"
             tmpLog.debug(com)
             # execute
             status, output = self.taskBuffer.processLimiter.getstatusoutput(
                 com)
             tmpLog.debug("return from main process: %s %s" %
                          (status, output))
             tmpLog.debug('fork end')
     except Exception as e:
         tmpLog.error('master failed with {0} {1}'.format(
             str(e), traceback.format_exc()))
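
The per-VO branch in run() above is a keyed plugin lookup with an ATLAS fallback. A minimal sketch of just that lookup, assuming the same panda_config.getPlugin('setupper_plugins', vo) interface used in the code; the 'atlas' VO name in the usage note is only an example.

from pandaserver.config import panda_config


def pick_setupper_plugin(vo):
    # plugin class registered for this VO, if any
    plugin_class = panda_config.getPlugin('setupper_plugins', vo)
    if plugin_class is None:
        # no VO-specific plugin configured: fall back to the ATLAS plug-in
        from pandaserver.dataservice.SetupperAtlasPlugin import SetupperAtlasPlugin
        plugin_class = SetupperAtlasPlugin
    return plugin_class

# e.g. pick_setupper_plugin('atlas').__name__  ->  'SetupperAtlasPlugin'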
Example 28
0
# modules used below in this snippet
import sys
import datetime

from pandaserver.srvcore.CoreUtils import commands_get_status_output

from pandaserver.taskbuffer.TaskBufferInterface import TaskBufferInterface

from pandacommon.pandalogger.PandaLogger import PandaLogger
from pandacommon.pandalogger.LogWrapper import LogWrapper

try:
    long
except NameError:
    long = int

# password
from pandaserver.config import panda_config

# logger
_logger = PandaLogger().getLogger('add')

tmpLog = LogWrapper(_logger, None)

tmpLog.debug("===================== start =====================")

# overall timeout value
overallTimeout = 20

# grace period
try:
    gracePeriod = int(sys.argv[1])
except Exception:
    gracePeriod = 3

# current minute
currentMinute = datetime.datetime.utcnow().minute
Example 29
0
 def application(environ, start_response):
     # get method name
     methodName = ''
     if 'SCRIPT_NAME' in environ:
         methodName = environ['SCRIPT_NAME'].split('/')[-1]
     tmpLog = LogWrapper(_logger, "PID={0} {1}".format(os.getpid(), methodName), seeMem=True)
     cont_length = int(environ.get('CONTENT_LENGTH', 0))
     json_body = environ.get('CONTENT_TYPE', None) == 'application/json'
     tmpLog.debug("start content-length={} json={}".format(cont_length, json_body))
     regStart = datetime.datetime.utcnow()
     retType = None
     # check method name
     if methodName not in allowedMethods:
         tmpLog.error("is forbidden")
         exeRes = "False : %s is forbidden" % methodName
     else:
         # get method object
         tmpMethod = None
         try:
             tmpMethod = globals()[methodName]
         except Exception:
             pass
         # object not found
         if tmpMethod is None:
             tmpLog.error("is undefined")
             exeRes = "False"
         else:
             body = b''
             try:
                 # dummy request object
                 dummyReq = DummyReq(environ, tmpLog)
                 if not dummyReq.authenticated:
                     start_response('403 Forbidden', [('Content-Type', 'text/plain')])
                     return ["ERROR : Token authentication failed on the server side. {}".format(
                         dummyReq.message).encode()]
                 username = dummyReq.subprocess_env.get('SSL_CLIENT_S_DN', None)
                 if username:
                     username = CoreUtils.clean_user_id(username)
                     if username in ban_user_list:
                         errMsg = '{} is banned'.format(username)
                         tmpLog.warning(errMsg)
                         start_response('403 Forbidden', [('Content-Type', 'text/plain')])
                         return ["ERROR : {}".format(errMsg).encode()]
                 # read contents
                 while cont_length > 0:
                     chunk = environ['wsgi.input'].read(min(cont_length, 1024*1024))
                     if not chunk:
                         break
                     cont_length -= len(chunk)
                     body += chunk
                 if cont_length > 0:
                     raise OSError('partial read from client. {} bytes remaining'.format(cont_length))
                 if not json_body:
                     # query string
                     environ['wsgi.input'] = io.BytesIO(body)
                     # get params
                     tmpPars = cgi.FieldStorage(environ['wsgi.input'], environ=environ,
                                                keep_blank_values=1)
                     # convert to map
                     params = {}
                     for tmpKey in list(tmpPars):
                         if tmpPars[tmpKey].file is not None and tmpPars[tmpKey].filename is not None:
                             # file
                             params[tmpKey] = tmpPars[tmpKey]
                         else:
                             # string
                             params[tmpKey] = tmpPars.getfirst(tmpKey)
                 else:
                     # json
                     body = gzip.decompress(body)
                     params = json.loads(body)
                 if panda_config.entryVerbose:
                     tmpLog.debug("with %s" % str(list(params)))
                 param_list = [dummyReq]
                 # exec
                 exeRes = tmpMethod(*param_list, **params)
                 # extract return type
                 if isinstance(exeRes, dict):
                     retType = exeRes['type']
                     exeRes  = exeRes['content']
                 # convert bool to string
                 if exeRes in [True,False]:
                     exeRes = str(exeRes)
             except Exception as e:
                 tmpLog.error("execution failure : {0}\n {1}".format(str(e), traceback.format_exc()))
                 if hasattr(panda_config, 'dumpBadRequest') and panda_config.dumpBadRequest:
                     try:
                         with tempfile.NamedTemporaryFile(delete=False, prefix='req_dump_') as f:
                             environ['WSGI_INPUT_DUMP'] = f.name
                             f.write(body)
                             os.chmod(f.name, 0o775)
                     except Exception:
                         tmpLog.error(traceback.format_exc())
                         pass
                 errStr = ""
                 for tmpKey in environ:
                     tmpVal = environ[tmpKey]
                     errStr += "%s : %s\n" % (tmpKey,str(tmpVal))
                 tmpLog.error(errStr)
                 # return internal server error
                 start_response('500 INTERNAL SERVER ERROR', [('Content-Type', 'text/plain')])
                 # force kill to release memory
                 if type(e) == OSError:
                     tmpLog.warning('force restart due to OSError')
                     os.kill(os.getpid(), signal.SIGINT)
                 return [str(e).encode()]
     if panda_config.entryVerbose:
         tmpLog.debug("done")
     regTime = datetime.datetime.utcnow() - regStart
     tmpLog.info("exec_time=%s.%03d sec, return len=%s B" % (regTime.seconds,
                                                             regTime.microseconds/1000,
                                                             len(str(exeRes))))
     # return
     if exeRes == pandaserver.taskbuffer.ErrorCode.EC_NotFound:
         start_response('404 Not Found', [('Content-Type', 'text/plain')])
         return ['not found'.encode()]
     elif isinstance(exeRes, pandaserver.taskbuffer.ErrorCode.EC_Redirect):
         start_response('302 Redirect', [('Location', exeRes.url)])
         return ['redirect'.encode()]
     else:
         if retType == 'json':
             start_response('200 OK', [('Content-Type', 'application/json')])
         else:
             start_response('200 OK', [('Content-Type', 'text/plain')])
         if isinstance(exeRes, str):
             exeRes = exeRes.encode()
         return [exeRes]
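
When Content-Type is application/json, the entry point above gzip-decompresses the body before json.loads, so a caller has to send a gzip-compressed JSON payload. A hedged client-side sketch of building such a request; the endpoint URL, method name, payload and certificate paths are placeholders, and real clients normally go through the PanDA client library.

import gzip
import json

payload = gzip.compress(json.dumps({'jediTaskID': 12345}).encode())
headers = {
    'Content-Type': 'application/json',
    'Content-Length': str(len(payload)),
}
# e.g. with the requests package (hypothetical endpoint and certificates):
# requests.post('https://panda.example.org:25443/server/panda/someMethod',
#               data=payload, headers=headers,
#               cert=('usercert.pem', 'userkey.pem'), verify='/path/to/ca')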