Ejemplo n.º 1
0
                    jobSpec)
                if not fileCheckInJEDI:
                    jobSpec.jobStatus = 'closed'
                    jobSpec.jobSubStatus = 'cojumbo_wrong'
                    jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                taskBuffer.archiveJobs([jobSpec], False, True)
        tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
            len(coJumboTokill)))
        if len(coJumboTokill) > 0:
            jediJobs = list(coJumboTokill)
            nJob = 100
            iJob = 0
            while iJob < len(jediJobs):
                tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob]))
                Client.killJobs(jediJobs[iJob:iJob + nJob],
                                51,
                                keepUnmerged=True)
                iJob += nJob
except Exception:
    errStr = traceback.format_exc()
    tmpLog.error(errStr)

tmpLog.debug("Fork session")


# thread for fork
class ForkThr(threading.Thread):
    """Thread subclass that remembers the file name given at construction."""

    def __init__(self, fileName):
        # let the base Thread set itself up first
        super(ForkThr, self).__init__()
        # path handed in by the caller; stored unchanged
        self.fileName = fileName
Ejemplo n.º 2
0
            if id not in jobsMap[prio]:
                jobsMap[prio].append(id)

# Flatten the priority map into one list: priorities in ascending order,
# and within each priority the largest PandaID first so newer jobs are
# killed before older ones.
jobs = []
for priority in sorted(jobsMap):
    pandaIDs = jobsMap[priority]
    # sorted in place, exactly like the original sort()+reverse()
    pandaIDs.sort(reverse=True)
    jobs.extend(pandaIDs)

# optional cap on how many jobs are touched
if options.maxJobs is not None:
    limit = int(options.maxJobs)
    jobs = jobs[:limit]

print('The number of jobs with priorities below %s : %s' %
      (args[0], len(jobs)))

# send the kill requests in batches, pausing one second after each batch
batchSize = 100
for offset in range(0, len(jobs), batchSize):
    batch = jobs[offset:offset + batchSize]
    print('kill %s' % str(batch))
    if options.forceKill:
        Client.killJobs(batch, 9)
    else:
        Client.killJobs(batch)
    time.sleep(1)
Ejemplo n.º 3
0
srcSQL += ')'

# Collect the PandaIDs of matching jobs from every live job table.
# `jobs` keeps first-seen order; `seenIDs` mirrors it as a set so the
# duplicate check stays O(1) instead of scanning the growing list.
jobs = []
seenIDs = set()
tables = [
    'ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4',
    'ATLAS_PANDA.jobsDefined4'
]
for table in tables:
    sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (
        table, srcSQL)
    # optional filters; the matching bind values are expected in varMap
    if options.jobID is not None:
        sql += "AND jobDefinitionID=:jobDefinitionID "
    if options.jobsetID not in (None, 'all'):
        sql += "AND jobsetID=:jobsetID "
    sql += "ORDER BY PandaID "
    status, res = proxyS.querySQLS(sql, varMap)
    if res is not None:
        # each row is a 1-tuple holding the PandaID
        for pandaID, in res:
            if pandaID not in seenIDs:
                seenIDs.add(pandaID)
                jobs.append(pandaID)

if jobs:
    # kill in batches of nJob, printing the progress offset with each batch
    nJob = 1000
    for iJob in range(0, len(jobs), nJob):
        subJobs = jobs[iJob:iJob + nJob]
        print("kill %s %s/%s" % (str(subJobs), iJob, len(jobs)))
        Client.killJobs(subJobs, code=9)
else:
    print("no job was killed")
Ejemplo n.º 4
0
status, res = proxyS.querySQLS(sql, varMap)

# Split the rows by owner: jobs locked by 'jedi' go to jediJobs,
# everything else to jobs.
for pandaID, owner in (res if res is not None else ()):
    bucket = jediJobs if owner == 'jedi' else jobs
    bucket.append(pandaID)

batchSize = 100

# reassign the non-JEDI jobs in ascending PandaID order, batch by batch,
# waiting ten seconds after every batch
jobs.sort()
for offset in range(0, len(jobs), batchSize):
    batch = jobs[offset:offset + batchSize]
    print('reassign  %s' % str(batch))
    Client.reassignJobs(batch)
    time.sleep(10)

# JEDI-locked jobs are killed rather than reassigned directly
for offset in range(0, len(jediJobs), batchSize):
    batch = jediJobs[offset:offset + batchSize]
    print('kill JEDI jobs %s' % str(batch))
    Client.killJobs(batch,
                    codeV,
                    keepUnmerged=options.keepUnmerged)

# the summary counts both groups together
print('\nreassigned {0} jobs'.format(len(jobs) + len(jediJobs)))
Ejemplo n.º 5
0
# open the database connection used for the lookups below
proxyS = DBProxy()
proxyS.connect(panda_config.dbhost, panda_config.dbpasswd,
               panda_config.dbuser, panda_config.dbname)

# `jobs` keeps first-seen order; `seenIDs` mirrors it as a set so the
# duplicate check stays O(1) instead of scanning the growing list.
jobs = []
seenIDs = set()

# bind variables: task ID plus the inclusive PandaID window from the
# positional arguments
varMap = {}
varMap[':prodSourceLabel'] = 'managed'
varMap[':taskID'] = args[0]
varMap[':pandaIDl'] = args[1]
varMap[':pandaIDu'] = args[2]
sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID"
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4', 'ATLAS_PANDA.jobsDefined4']:
    status, res = proxyS.querySQLS(sql % table, varMap)
    if res is not None:
        # each row is a 1-tuple holding the PandaID
        for pandaID, in res:
            if pandaID not in seenIDs:
                seenIDs.add(pandaID)
                jobs.append(pandaID)

print('The number of jobs to be killed : %s' % len(jobs))

# kill in batches of nJob, pausing one second after each batch
nJob = 100
for iJob in range(0, len(jobs), nJob):
    subJobs = jobs[iJob:iJob + nJob]
    print('kill %s' % str(subJobs))
    if options.forceKill:
        Client.killJobs(subJobs, 9, useMailAsID=useMailAsIDV)
    else:
        Client.killJobs(subJobs, useMailAsID=useMailAsIDV)
    time.sleep(1)
Ejemplo n.º 6
0
    sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsDefined4 "
else:
    sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsActive4 "
sql += "WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID"
status, res = proxyS.querySQLS(sql, varMap)

# res is checked for None further down, so it must be guarded here too:
# calling len() on None would abort the script with a TypeError.
print("got {0} jobs".format(len(res) if res is not None else 0))

# split the rows by owner: jobs locked by 'jedi' are handled separately
jobs = []
jediJobs = []
if res is not None:
    for (pandaID, lockedby) in res:
        if lockedby == 'jedi':
            jediJobs.append(pandaID)
        else:
            jobs.append(pandaID)

nJob = 100

# reassign the non-JEDI jobs batch by batch
for iJob in range(0, len(jobs), nJob):
    subJobs = jobs[iJob:iJob + nJob]
    print('reassign  %s' % str(subJobs))
    Client.reassignJobs(subJobs)

# JEDI-locked jobs are killed (code 51) instead of being reassigned directly
for iJob in range(0, len(jediJobs), nJob):
    subJobs = jediJobs[iJob:iJob + nJob]
    print('kill JEDI jobs %s' % str(subJobs))
    Client.killJobs(subJobs, 51)
Ejemplo n.º 7
0
# Pick the kill code: the explicit flags win over a numeric --codeV value.
if options.forceKill:
    codeV = 9
elif options.killUserJobs:
    codeV = 91
else:
    try:
        codeV = int(options.codeV)
    except Exception:
        # Not a usable integer: keep whatever codeV already holds.
        # NOTE(review): this relies on codeV being initialised before this
        # point, which is not visible here - confirm.
        pass
if options.killOwnProdJobs:
    useMailAsIDV = True

# One positional argument is a single PandaID; two arguments are an
# inclusive start/end range of PandaIDs.
if len(args) == 1:
    targetIDs = [args[0]]
else:
    startID = int(args[0])
    endID = int(args[1])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    # Materialise the range: on Python 3 range() is a lazy sequence object,
    # whereas the single-ID branch (and the Python 2 original) hands the
    # client a plain list.
    targetIDs = list(range(startID, endID + 1))

Client.killJobs(targetIDs,
                code=codeV,
                useMailAsID=useMailAsIDV,
                keepUnmerged=options.keepUnmerged,
                jobSubStatus=options.jobSubStatus)
Ejemplo n.º 8
0
import sys

import pandaserver.userinterface.Client as Client

# One argument is a single jobDefinitionID; two arguments are an inclusive
# start/end range of jobDefinitionIDs.
if len(sys.argv) == 2:
    jobDefIDs = [sys.argv[1]]
else:
    startID = int(sys.argv[1])
    endID = int(sys.argv[2])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    # Materialise the range: on Python 3 range() is a lazy sequence object,
    # whereas the single-ID branch hands the client a plain list.
    jobDefIDs = list(range(startID, endID + 1))

# query PandaIDs
status, ids = Client.queryPandaIDs(jobDefIDs)

# NOTE(review): a failed query exits with status 0, which callers cannot
# tell apart from success; kept as-is for backward compatibility.
if status != 0:
    sys.exit(0)

# drop the None placeholders in a single pass
ids = [pandaID for pandaID in ids if pandaID is not None]

# kill
if ids:
    Client.killJobs(ids)
Ejemplo n.º 9
0
def main(argv=tuple(), tbuf=None, **kwargs):
    """Run one pass of the periodic housekeeping sessions.

    The sessions run in this order, and a failure in one is logged without
    stopping the following ones:

    1. "Del session"   - delete rows from jobsDefined4 whose status is
       activated/waiting/failed/cancelled and whose PandaID is below the
       current maximum.
    2. "pilotCounts"   - parse the dispatcher's PilotRequests log, count
       requests per site/method/node for the last 3 hours and the last hour,
       and push both tables through ``updateSiteData``.
    3. "nRunning"      - call ``insertnRunningInSiteData`` when the current
       minute falls into this host's slot.
    4. "co-jumbo"      - archive co-jumbo jobs that are ready to finish and
       kill the ones flagged for killing.
    5. "Fork session"  - launch forkSetupper.py for recent ``set.*`` files in
       the log directory that no running process mentions.

    Args:
        argv: not read by this function.
        tbuf: task-buffer object to use. When None, the module-level
            ``taskBuffer`` singleton is imported and initialised from
            ``panda_config``.
        **kwargs: not read by this function.
    """
    tmpLog = LogWrapper(_logger, None)

    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost,
                        panda_config.dbpasswd,
                        nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            # still best effort, but leave a trace instead of failing silently
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("Del session : %s %s" % (errType, errValue))

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3,
                                        minute=2,
                                        second=0,
                                        microsecond=0)
        if (timeNow > logRotateTime and (timeNow-logRotateTime) < datetime.timedelta(minutes=5)) or \
               (logRotateTime > timeNow and (logRotateTime-timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limits: long window (3h) and short window (1h)
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(
                hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(
                hours=1)
            # check if the rotated archive is required: it is unless the
            # first line of the current log is older than the long window
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                                  loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp files left over from earlier runs
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow(
            ).strftime('%Y-%m-%d-%H-%M-%S'))
            # line format: timestamp ... method=,site=,node=,type=
            # (compiled once, outside the per-file and per-line loops)
            linePattern = re.compile(
                r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                r'method=(.+),site=(.+),node=(.+),type=(.+)')
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # read; the context manager closes the file even when a
                # line makes the parsing below raise
                with open(tmpLogName) as logFH:
                    for line in logFH:
                        # check format
                        match = linePattern.search(line)
                        if match is None:
                            continue
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)

                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum over the long window
                        nodeCounts = pilotCounts.setdefault(
                            tmpSite, {}).setdefault(tmpMethod, {})
                        nodeCounts[tmpNode] = nodeCounts.get(tmpNode, 0) + 1
                        # short window
                        if timeStamp > timeLimitS:
                            nodeCountsS = pilotCountsS.setdefault(
                                tmpSite, {}).setdefault(tmpMethod, {})
                            nodeCountsS[tmpNode] = nodeCountsS.get(
                                tmpNode, 0) + 1
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        # Floor division is required: with "/" Python 3 yields a float slot
        # number that only equals the integer nrun_snum when the minute is
        # an exact multiple of nrun_interval.
        if (currentMinute // panda_config.nrun_interval
            ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            # One entry per source table:
            # (label for the log, PandaIDs, which table peekJobs reads,
            #  extra positional flags forwarded verbatim to archiveJobs)
            finishPlan = (
                ('Active', coJumboA,
                 dict(fromDefined=False, fromActive=True,
                      fromArchived=False, fromWaiting=False),
                 (False,)),
                ('Defined', coJumboD,
                 dict(fromDefined=True, fromActive=False,
                      fromArchived=False, fromWaiting=False),
                 (True,)),
                ('Waiting', coJumboW,
                 dict(fromDefined=False, fromActive=False,
                      fromArchived=False, fromWaiting=True),
                 (False, True)),
            )
            for tableLabel, pandaIDs, peekFlags, archiveArgs in finishPlan:
                tmpLog.debug("finish {0} co-jumbo jobs in {1}".format(
                    len(pandaIDs), tableLabel))
                if len(pandaIDs) > 0:
                    jobSpecs = taskBuffer.peekJobs(pandaIDs, **peekFlags)
                    for jobSpec in jobSpecs:
                        fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                            jobSpec)
                        if not fileCheckInJEDI:
                            # input files disagree with JEDI: close the job
                            # with a dedicated sub-status and error code
                            jobSpec.jobStatus = 'closed'
                            jobSpec.jobSubStatus = 'cojumbo_wrong'
                            jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], *archiveArgs)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                # kill in batches of nJob
                nJob = 100
                for iJob in range(0, len(jediJobs), nJob):
                    subJobs = jediJobs[iJob:iJob + nJob]
                    tmpLog.debug(' killing %s' % str(subJobs))
                    Client.killJobs(subJobs,
                                    51,
                                    keepUnmerged=True)
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        """Runs forkSetupper.py on one set.* file in a shell."""

        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            # inside a virtualenv the sysconfig file lives under its prefix
            prefix = os.environ.get('VIRTUAL_ENV', '')
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            # files named set.NULL.* get the extra -t switch
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        # the file may have disappeared since the glob
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            # (only the first six struct_time fields are date/time parts;
            # the seventh is the weekday and must not reach datetime())
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:6]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    # stop once maxThr threads have been started
                    if nThr >= maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    tmpLog.debug("===================== end =====================")