def killJobs(self, ids, code=None, verbose=False):
    """Kill jobs.

    Normal users can kill only their own jobs. People with production VOMS
    role can kill any jobs. Running jobs are killed when next heartbeat
    comes from the pilot. Set code=9 if running jobs need to be killed
    immediately.

    args:
        ids: the list of PandaIDs
        code: specify why the jobs are killed
              2: expire
              3: aborted
              4: expire in waiting
              7: retry by server
              8: rebrokerage
              9: force kill
              50: kill by JEDI
              91: kill user jobs with prod role
        verbose: set True to see what's going on
    returns:
        the (status, output) pair produced by Client.killJobs
        status code
              0: communication succeeded to the panda server
              255: communication failure
        output: described upstream as "the list of clouds (or Nones if
              tasks are not yet assigned)"
              NOTE(review): that wording looks copied from another
              method -- confirm the real payload against the client.
    """
    # Imported at call time, exactly as the original did, so that loading
    # this module does not require the client package.
    import userinterface.Client as Client
    status, output = Client.killJobs(ids, code=code, verbose=verbose)
    # The result used to be computed and then dropped, so callers always
    # received None despite the documented return value.
    return status, output
def killJobs(self, ids, code=None, verbose=False):
    """Kill jobs.

    Normal users can kill only their own jobs. People with production VOMS
    role can kill any jobs. Running jobs are killed when next heartbeat
    comes from the pilot. Set code=9 if running jobs need to be killed
    immediately.

    args:
        ids: the list of PandaIDs
        code: specify why the jobs are killed
              2: expire
              3: aborted
              4: expire in waiting
              7: retry by server
              8: rebrokerage
              9: force kill
              50: kill by JEDI
              91: kill user jobs with prod role
        verbose: set True to see what's going on
    returns:
        the (status, output) pair produced by Client.killJobs
        status code
              0: communication succeeded to the panda server
              255: communication failure
        output: described upstream as "the list of clouds (or Nones if
              tasks are not yet assigned)"
              NOTE(review): that wording looks copied from another
              method -- confirm the real payload against the client.
    """
    # Deferred import kept from the original: the client package is only
    # needed when a kill is actually requested.
    import userinterface.Client as Client
    s, o = Client.killJobs(ids, code=code, verbose=verbose)
    # Hand the client's answer back to the caller; previously it was
    # assigned to locals and silently discarded.
    return s, o
def killJobs(jobList):
    """Kill the given jobs through the PanDA client and return its output.

    args:
        jobList: job identifiers, forwarded unchanged to Client.killJobs
    returns:
        the second element of the (status, output) pair returned by
        Client.killJobs; the status itself is only written to the debug log
    """
    # Parenthesised single-argument print behaves the same on Python 2 and
    # is valid syntax on Python 3, unlike the bare print statement.
    print('Kill jobs')
    _logger.debug('Kill jobs')
    _logger.debug(str(jobList))
    # No kill code is passed here. The original note on this line said
    # "Code 3 eqs. aborted status"; that code is NOT sent by this call.
    status, output = Client.killJobs(jobList)
    _logger.debug(output)
    _logger.debug(status)
    _logger.debug("---------------------")
    return output
def killJobs(jobList):
    """Kill the given jobs through the PanDA client and return its output.

    args:
        jobList: job identifiers, forwarded unchanged to Client.killJobs
    returns:
        the second element of the (status, output) pair returned by
        Client.killJobs; the status itself is only written to the debug log
    """
    # Parenthesised single-argument print behaves the same on Python 2 and
    # is valid syntax on Python 3, unlike the bare print statement.
    print('Kill jobs')
    _logger.debug('Kill jobs')
    _logger.debug(str(jobList))
    # No kill code is passed here. The original note on this line said
    # "Code 3 eqs. aborted status"; that code is NOT sent by this call.
    s, o = Client.killJobs(jobList)
    _logger.debug(o)
    _logger.debug(s)
    _logger.debug("---------------------")
    return o
def main():
    """Kill a hard-coded, inclusive range of PandaIDs and print each result.

    The range currently holds a single ID. The IDs are sent in one
    Client.killJobs call using the module-level server ID ``aSrvID``, and
    every element of the returned output is printed on its own line.
    """
    # Inclusive bounds of the PandaID range to kill.
    first_id = 4005758
    last_id = 4005758
    jobs_list = list(range(first_id, last_id + 1))
    # Parenthesised single-argument print: same output on Python 2, valid
    # syntax on Python 3.
    print(jobs_list)
    s, o = Client.killJobs(jobs_list, srvID=aSrvID)
    for x in o:
        print(x)
    # NOTE(review): assumed to run once after the loop; the excerpt's
    # indentation was lost, so confirm it was not meant to log per result.
    logger.info('done')
# password from config import panda_config passwd = panda_config.dbpasswd cloud = sys.argv[1] # instantiate DB proxies proxyS = DBProxy() proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) while True: # get PandaIDs res = proxyS.querySQL("SELECT PandaID FROM jobsWaiting4 WHERE cloud='%s' ORDER BY PandaID" % cloud) # escape if len(res) == 0: break # convert to list jobs = [] for id, in res: jobs.append(id) # reassign nJob = 300 iJob = 0 while iJob < len(jobs): print 'killJobs(%s)' % jobs[iJob:iJob+nJob] Client.killJobs(jobs[iJob:iJob+nJob]) iJob += nJob time.sleep(60)
jobSpec) if not fileCheckInJEDI: jobSpec.jobStatus = 'closed' jobSpec.jobSubStatus = 'cojumbo_wrong' jobSpec.taskBufferErrorCode = taskbuffer.ErrorCode.EC_EventServiceInconsistentIn taskBuffer.archiveJobs([jobSpec], False, True) tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format( len(coJumboTokill))) if len(coJumboTokill) > 0: jediJobs = list(coJumboTokill) nJob = 100 iJob = 0 while iJob < len(jediJobs): tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob])) Client.killJobs(jediJobs[iJob:iJob + nJob], 51, keepUnmerged=True) iJob += nJob except: errStr = traceback.format_exc() tmpLog.error(errStr) tmpLog.debug("Fork session") # thread for fork class ForkThr(threading.Thread): def __init__(self, fileName): threading.Thread.__init__(self) self.fileName = fileName
jobs = [] varMap = {} varMap[':prodSourceLabel'] = 'managed' varMap[':taskID'] = args[0] varMap[':pandaIDl'] = args[1] varMap[':pandaIDu'] = args[2] sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID" for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: status,res = proxyS.querySQLS(sql % table,varMap) if res != None: for id, in res: if not id in jobs: jobs.append(id) print 'The number of jobs to be killed : %s' % len(jobs) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'kill %s' % str(jobs[iJob:iJob+nJob]) if options.forceKill: Client.killJobs(jobs[iJob:iJob+nJob],9,useMailAsID=useMailAsIDV) else: Client.killJobs(jobs[iJob:iJob+nJob],useMailAsID=useMailAsIDV) iJob += nJob time.sleep(1)
import sys
import subprocess
import random
from termcolor2 import c

import userinterface.Client as Client
from taskbuffer.JobSpec import JobSpec
from taskbuffer.FileSpec import FileSpec

# Kill the PandaIDs given on the command line and print a coloured
# success/failed line for each of them.

aSrvID = None
#for idx,argv in enumerate(sys.argv):
#    if argv == '-s':
#        aSrvID = sys.argv[idx+1]
#        sys.argv = sys.argv[:idx]
#        break

# Only purely numeric arguments are treated as PandaIDs; anything else on
# the command line is silently ignored.
jobs_to_kill = [arg for arg in sys.argv[1:] if arg.isdigit()]

s, o = Client.killJobs(jobs_to_kill, verbose=False)

# A non-zero status means the request itself failed, so `o` is not a list
# of per-job results and must not be indexed as one.
if s != 0:
    print(c("Failed to communicate with the panda server (status %s)" % s).red)
    sys.exit(1)

print("Job killing results:\n=============================")
# Results are matched to the requested IDs by position.
for job_id, killed in zip(jobs_to_kill, o):
    if killed:
        print(c("%s: %s" % (job_id, 'success')).green)
    else:
        print(c("%s: %s" % (job_id, 'failed')).red)
import sys

import userinterface.Client as Client

# Kill every job belonging to one job definition ID, or to an inclusive
# range of job definition IDs, given on the command line.

if len(sys.argv) < 2:
    # Previously this fell through to sys.argv[1] and died with IndexError.
    print('usage: %s <jobDefinitionID> | <startID> <endID>' % sys.argv[0])
    sys.exit(1)

if len(sys.argv) == 2:
    # Single ID: passed through as the raw command-line string.
    jobDefIDs = [sys.argv[1]]
else:
    startID = int(sys.argv[1])
    endID = int(sys.argv[2])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    # Materialise the range so the client receives a real list on
    # Python 3 as well (on Python 2 range() already returns one).
    jobDefIDs = list(range(startID, endID + 1))

# query PandaID
status, ids = Client.queryPandaIDs(jobDefIDs)
if status != 0:
    # NOTE(review): a failed query exits with code 0, so callers cannot
    # detect it. Kept as-is to avoid changing the script's exit contract.
    sys.exit(0)

# remove None (single pass instead of repeated list.remove scans)
ids = [panda_id for panda_id in ids if panda_id is not None]

# kill
if len(ids) != 0:
    Client.killJobs(ids)
options,args = optP.parse_args() aSrvID = None codeV = None useMailAsIDV = False if options.forceKill: codeV = 9 elif options.killUserJobs: codeV = 91 else: try: codeV = int(options.codeV) except Exception: pass if options.killOwnProdJobs: useMailAsIDV = True if len(args) == 1: Client.killJobs([args[0]], code=codeV, useMailAsID=useMailAsIDV, keepUnmerged=options.keepUnmerged, jobSubStatus=options.jobSubStatus) else: startID = int(args[0]) endID = int(args[1]) if startID > endID: print '%d is less than %d' % (endID,startID) sys.exit(1) Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV,keepUnmerged=options.keepUnmerged, jobSubStatus=options.jobSubStatus)
sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsDefined4 " else: sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsActive4 " sql += "WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" status, res = proxyS.querySQLS(sql, varMap) print "got {0} jobs".format(len(res)) jobs = [] jediJobs = [] if res != None: for (id, lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob + nJob]) Client.reassignJobs(jobs[iJob:iJob + nJob]) iJob += nJob if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob]) Client.killJobs(jediJobs[iJob:iJob + nJob], 51) iJob += nJob
import sys

import userinterface.Client as Client

# Kill every job belonging to one job definition ID, or to an inclusive
# range of job definition IDs, given on the command line.

if len(sys.argv) < 2:
    # Previously this fell through to sys.argv[1] and died with IndexError.
    print('usage: %s <jobDefinitionID> | <startID> <endID>' % sys.argv[0])
    sys.exit(1)

if len(sys.argv) == 2:
    # Single ID: passed through as the raw command-line string.
    jobDefIDs = [sys.argv[1]]
else:
    startID = int(sys.argv[1])
    endID = int(sys.argv[2])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    # Materialise the range so the client receives a real list on
    # Python 3 as well (on Python 2 range() already returns one).
    jobDefIDs = list(range(startID, endID + 1))

# query PandaID
status, ids = Client.queryPandaIDs(jobDefIDs)
if status != 0:
    # NOTE(review): a failed query exits with code 0, so callers cannot
    # detect it. Kept as-is to avoid changing the script's exit contract.
    sys.exit(0)

# remove None (single pass instead of repeated list.remove scans)
ids = [panda_id for panda_id in ids if panda_id is not None]

# kill
if len(ids) != 0:
    Client.killJobs(ids)
jobSpecs = taskBuffer.peekJobs(coJumboW,fromDefined=False,fromActive=False,fromArchived=False,fromWaiting=True) for jobSpec in jobSpecs: fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec) if not fileCheckInJEDI: jobSpec.jobStatus = 'closed' jobSpec.jobSubStatus = 'cojumbo_wrong' jobSpec.taskBufferErrorCode = taskbuffer.ErrorCode.EC_EventServiceInconsistentIn taskBuffer.archiveJobs([jobSpec],False,True) tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(len(coJumboTokill))) if len(coJumboTokill) > 0: jediJobs = list(coJumboTokill) nJob = 100 iJob = 0 while iJob < len(jediJobs): tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob+nJob])) Client.killJobs(jediJobs[iJob:iJob+nJob],51,keepUnmerged=True) iJob += nJob except: errStr = traceback.format_exc() tmpLog.error(errStr) tmpLog.debug("Fork session") # thread for fork class ForkThr (threading.Thread): def __init__(self,fileName): threading.Thread.__init__(self) self.fileName = fileName def run(self): setupStr = 'source /etc/sysconfig/panda_server; '
import datetime

from taskbuffer.DBProxy import DBProxy
import userinterface.Client as Client

# Kill every job in jobsDefined4 whose modificationTime is more than one
# day in the past.

# database password comes from the server configuration
from config import panda_config
passwd = panda_config.dbpasswd

# anything last modified before this instant counts as old
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(days=1)

# open the DB connection
proxyS = DBProxy()
proxyS.connect('adbpro.usatlas.bnl.gov', passwd, 'panda-developer', 'PandaDevDB')

# fetch (PandaID, modificationTime) for all defined jobs
res = proxyS.querySQL("SELECT PandaID,modificationTime from jobsDefined4 ORDER BY modificationTime")

# keep only the old ones and kill them in a single request
jobs = [pandaID for (pandaID, lastModified) in res if lastModified < timeLimit]

Client.killJobs(jobs)
sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsActive4 " sql += "WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" status,res = proxyS.querySQLS(sql,varMap) print "got {0} jobs".format(len(res)) jobs = [] jediJobs = [] if res != None: for (id,lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob+nJob]) Client.reassignJobs(jobs[iJob:iJob+nJob]) iJob += nJob if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob+nJob]) Client.killJobs(jediJobs[iJob:iJob+nJob],51) iJob += nJob
if options.prodSourceLabel != None: varMap[':src3'] = options.prodSourceLabel srcSQL += ',:src3' srcSQL += ')' jobs = [] tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4'] for table in tables: sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (table,srcSQL) if options.jobID != None: sql += "AND jobDefinitionID=:jobDefinitionID " if not options.jobsetID in (None,'all'): sql += "AND jobsetID=:jobsetID " sql += "ORDER BY PandaID " status,res = proxyS.querySQLS(sql,varMap) if res != None: for id, in res: if not id in jobs: jobs.append(id) if len(jobs): iJob = 0 nJob = 1000 while iJob < len(jobs): subJobs = jobs[iJob:iJob+nJob] print "kill %s %s/%s" % (str(subJobs),iJob,len(jobs)) Client.killJobs(subJobs,code=9) iJob += nJob else: print "no job was killed"
srcSQL += ')' jobs = [] tables = [ 'ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4', 'ATLAS_PANDA.jobsDefined4' ] for table in tables: sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % ( table, srcSQL) if options.jobID != None: sql += "AND jobDefinitionID=:jobDefinitionID " if not options.jobsetID in (None, 'all'): sql += "AND jobsetID=:jobsetID " sql += "ORDER BY PandaID " status, res = proxyS.querySQLS(sql, varMap) if res != None: for id, in res: if not id in jobs: jobs.append(id) if len(jobs): iJob = 0 nJob = 1000 while iJob < len(jobs): subJobs = jobs[iJob:iJob + nJob] print "kill %s %s/%s" % (str(subJobs), iJob, len(jobs)) Client.killJobs(subJobs, code=9) iJob += nJob else: print "no job was killed"
if res != None: for (id,lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) # reassign jobs.sort() if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob+nJob]) Client.reassignJobs(jobs[iJob:iJob+nJob]) iJob += nJob time.sleep(10) if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob+nJob]) Client.killJobs(jediJobs[iJob:iJob+nJob],codeV,keepUnmerged=options.keepUnmerged) iJob += nJob print print 'reassigned {0} jobs'.format(len(jobs+jediJobs))
if res != None: for (id, lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) # reassign jobs.sort() if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob + nJob]) Client.reassignJobs(jobs[iJob:iJob + nJob]) iJob += nJob time.sleep(10) if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob]) Client.killJobs(jediJobs[iJob:iJob + nJob], codeV, keepUnmerged=options.keepUnmerged) iJob += nJob print print 'reassigned {0} jobs'.format(len(jobs + jediJobs))
jobsMap[prio] = [] if not id in jobsMap[prio]: jobsMap[prio].append(id) # order by PandaID and currentPriority jobs = [] prioList = jobsMap.keys() prioList.sort() for prio in prioList: # reverse order by PandaID to kill newer jobs ids = jobsMap[prio] ids.sort() ids.reverse() jobs += ids if options.maxJobs != None: jobs = jobs[:int(options.maxJobs)] print 'The number of jobs with priorities below %s : %s' % (args[0], len(jobs)) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'kill %s' % str(jobs[iJob:iJob + nJob]) if options.forceKill: Client.killJobs(jobs[iJob:iJob + nJob], 9) else: Client.killJobs(jobs[iJob:iJob + nJob]) iJob += nJob time.sleep(1)
help='kill user jobs using a production role') options, args = optP.parse_args() aSrvID = None codeV = None useMailAsIDV = False if options.forceKill: codeV = 9 elif options.killUserJobs: codeV = 91 if options.killOwnProdJobs: useMailAsIDV = True if len(args) == 1: Client.killJobs([args[0]], code=codeV, useMailAsID=useMailAsIDV, keepUnmerged=options.keepUnmerged) else: startID = int(args[0]) endID = int(args[1]) if startID > endID: print '%d is less than %d' % (endID, startID) sys.exit(1) Client.killJobs(range(startID, endID + 1), code=codeV, useMailAsID=useMailAsIDV, keepUnmerged=options.keepUnmerged)
default=False,help='kill jobs before next heartbeat is coming') optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs', default=False,help='kill own production jobs without a production role') optP.add_option('--killUserJobs',action='store_const',const=True,dest='killUserJobs', default=False,help='kill user jobs using a production role') options,args = optP.parse_args() aSrvID = None codeV = None useMailAsIDV = False if options.forceKill: codeV = 9 elif options.killUserJobs: codeV = 91 if options.killOwnProdJobs: useMailAsIDV = True if len(args) == 1: Client.killJobs([args[0]],code=codeV,useMailAsID=useMailAsIDV) else: startID = int(args[0]) endID = int(args[1]) if startID > endID: print '%d is less than %d' % (endID,startID) sys.exit(1) Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV)
sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsWaiting4 WHERE jobStatus=:jobStatus AND taskID=:taskID AND modificationTime<:modificationTime " status, res = proxyS.querySQLS(sql, varMap) if res != None: for (id, lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) # reassign jobs.sort() if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob + nJob]) Client.reassignJobs(jobs[iJob:iJob + nJob]) iJob += nJob time.sleep(10) if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob]) Client.killJobs(jediJobs[iJob:iJob + nJob], codeV) iJob += nJob print print 'reassigned {0} jobs'.format(len(jobs + jediJobs))