def run(self):
    """Erase the dis datasets assigned to this thread and mark them deleted in the DB.

    Only datasets whose name ends in _dis<digits> (dispatch/input staging datasets)
    are eligible; anything else is logged as an error and skipped.
    """
    self.lock.acquire()
    try:
        # loop over all datasets
        for vuid, name, modDate in self.datasets:
            # only dis datasets; raw string so \d is a regex digit class, not a string escape
            if re.search(r'_dis\d+$', name) is None:
                _logger.error("Eraser : non disDS %s" % name)
                continue
            # delete
            _logger.debug("Eraser %s dis %s %s" % (self.operationType, modDate, name))
            # delete or shorten
            endStatus = 'deleted'
            status, out = rucioAPI.eraseDataset(name)
            if not status:
                _logger.error(out)
                continue
            _logger.debug('OK with %s' % name)
            # update the dataset row; proxyLock serializes DB access with sibling threads
            self.proxyLock.acquire()
            varMap = {}
            varMap[':vuid'] = vuid
            varMap[':status'] = endStatus
            taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap)
            self.proxyLock.release()
    except Exception:
        errStr = traceback.format_exc()
        _logger.error(errStr)
    # detach from the worker pool and release the thread slot unconditionally
    self.pool.remove(self)
    self.lock.release()
def main(tbuf=None, **kwargs):
    """Refresh the MyProxy proxy cache for every user who opted in via GRIDPREF."""
    # logger
    tmp_log = LogWrapper(_logger)
    tmp_log.debug("================= start ==================")
    # instantiate TB unless an instance was handed in
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles to cache a proxy for; overridable through panda_config
    configured_roles = getattr(panda_config, 'proxy_cache_roles', None)
    if configured_roles is not None:
        roles = configured_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']
    # users whose GRIDPREF contains 'p' opted in to proxy caching
    sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    tmp_stat, tmp_res = taskBuffer.querySQLS(sql, {':patt': '%p%'})
    for real_dn, in tmp_res:
        if real_dn is None:
            continue
        real_dn = CoreUtils.get_bare_dn(real_dn, keep_digits=False)
        name = taskBuffer.cleanUserID(real_dn)
        # check proxy
        tmp_log.debug("check proxy cache for {}".format(name))
        for role in roles:
            my_proxy_interface_instance.checkProxy(real_dn, role=role, name=name)
    tmp_log.debug("done")
def get_task_attribute_map(task_id_list):
    """Look up prodsourcelabel and gshare for a list of JEDI tasks.

    :param task_id_list: iterable of JEDI task IDs
    :return: tuple (task_id -> prodsourcelabel dict, task_id -> gshare dict)
    """
    task_pslabel_map = {}
    task_gshare_map = {}
    # nothing to look up; an empty IN () clause would be invalid SQL
    if not task_id_list:
        return task_pslabel_map, task_gshare_map
    # one bind variable per task ID
    var_map = {':task_id{0}'.format(i): task_id for i, task_id in enumerate(task_id_list)}
    task_id_bindings = ','.join(':task_id{0}'.format(i) for i in range(len(task_id_list)))
    sql = """
    SELECT jeditaskid, prodsourcelabel, gshare
    FROM ATLAS_PANDA.jedi_tasks
    WHERE jeditaskid IN({0})
    """.format(task_id_bindings)
    _logger.debug('sql: {0}'.format(sql))
    _logger.debug('task_id_bindings: {0}'.format(task_id_bindings))
    # (removed a stray no-op bare `var_map` expression statement that was here)
    status, ret_sel = taskBuffer.querySQLS(sql, var_map)
    if ret_sel:
        _logger.debug('ret_sel: {0}'.format(ret_sel))
        for jeditaskid, prodsourcelabel, gshare in ret_sel:
            task_pslabel_map[jeditaskid] = prodsourcelabel
            task_gshare_map[jeditaskid] = gshare
    return task_pslabel_map, task_gshare_map
tmpLog.debug("old process : %s %s" % (pid, startTime)) tmpLog.debug(line) commands_get_status_output('kill -9 %s' % pid) except Exception: type, value, traceBack = sys.exc_info() tmpLog.error("kill process : %s %s" % (type, value)) # instantiate TB taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1) # instantiate sitemapper aSiteMapper = SiteMapper(taskBuffer) # delete tmpLog.debug("Del session") status, retSel = taskBuffer.querySQLS( "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {}) if retSel is not None: try: maxID = retSel[0][0] tmpLog.debug("maxID : %s" % maxID) if maxID is not None: varMap = {} varMap[':maxID'] = maxID varMap[':jobStatus1'] = 'activated' varMap[':jobStatus2'] = 'waiting' varMap[':jobStatus3'] = 'failed' varMap[':jobStatus4'] = 'cancelled' status, retDel = taskBuffer.querySQLS( "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)", varMap) except Exception:
# breakdown of analysis ('user') jobs per user / working group / site,
# excluding pmerge jobs; archived table is limited to the last hour
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
    varMap[':prodSourceLabel'] = 'user'
    varMap[':pmerge'] = 'pmerge'
    if table == 'ATLAS_PANDA.jobsActive4':
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
    else:
        # with time range for archived table
        varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge AND modificationTime>:modificationTime GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
    # exec
    status, res = taskBuffer.querySQLS(sql, varMap, arraySize=10000)
    if res is None:
        tmpLog.debug("total %s " % res)
        # a failed query previously fell through and crashed the loop below
        # with a TypeError; treat it as an empty result instead
        res = []
    else:
        tmpLog.debug("total %s " % len(res))
    # make map
    for cnt, prodUserName, jobStatus, workingGroup, computingSite in res:
        # append to PerUser map
        usageBreakDownPerUser.setdefault(prodUserName, {})
        usageBreakDownPerUser[prodUserName].setdefault(workingGroup, {})
        usageBreakDownPerUser[prodUserName][workingGroup].setdefault(
            computingSite, {
                'rundone': 0,
                'activated': 0,
                'running': 0
            })
def run(self):
    """Freeze output datasets whose files are all in a terminal state.

    For each assigned dataset: check its files in filesTable4; if all are done,
    close/freeze the dataset in Rucio, mark it 'completed' in the Datasets table
    and trigger cleanup of the corresponding dis datasets. Otherwise just bump
    the modification date so the dataset is revisited later.
    """
    self.lock.acquire()
    try:
        for vuid, name, modDate in self.datasets:
            _logger.debug("Freezer start %s %s" % (modDate, name))
            # look up all files destined for this dataset
            self.proxyLock.acquire()
            retF, resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID,status FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ", {':destinationDBlock': name})
            self.proxyLock.release()
            # NOTE(review): retF < 0 assumes querySQLS returns a numeric status;
            # a None status would raise here — confirm against querySQLS contract
            if retF < 0:
                _logger.error("SQL error")
            else:
                # all files must be in a terminal-ish state before freezing
                allFinished = True
                onePandaID = None
                for tmpPandaID, tmpFileStatus in resF:
                    onePandaID = tmpPandaID
                    if not tmpFileStatus in ['ready', 'failed', 'skipped', 'merging', 'finished']:
                        allFinished = False
                        break
                # check sub datasets in the jobset for event service job
                if allFinished:
                    self.proxyLock.acquire()
                    tmpJobs = taskBuffer.getFullJobStatus([onePandaID])
                    self.proxyLock.release()
                    if len(tmpJobs) > 0 and tmpJobs[0] is not None:
                        if EventServiceUtils.isEventServiceMerge(tmpJobs[0]):
                            self.proxyLock.acquire()
                            cThr = Closer(taskBuffer, [], tmpJobs[0])
                            allFinished = cThr.checkSubDatasetsInJobset()
                            self.proxyLock.release()
                            _logger.debug("closer checked sub datasets in the jobset for %s : %s" % (name, allFinished))
                # no files in filesTable
                if allFinished:
                    _logger.debug("freeze %s " % name)
                    # these name patterns are not registered as real DDM datasets
                    dsExists = True
                    if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                            or name.startswith('hc_test.') or name.startswith('panda.um.'):
                        dsExists = False
                    if name.startswith('panda.um.'):
                        # panda.um.* datasets: hand unfinished merge jobs to the JEDI closer
                        self.proxyLock.acquire()
                        retMer, resMer = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock AND status IN (:statusM,:statusF) ", {':destinationDBlock': name, ':statusM': 'merging', ':statusF': 'failed'})
                        self.proxyLock.release()
                        if resMer is not None and len(resMer) > 0:
                            mergeID = resMer[0][0]
                            # get merging jobs
                            self.proxyLock.acquire()
                            mergingJobs = taskBuffer.peekJobs([mergeID], fromDefined=False, fromArchived=False, fromWaiting=False)
                            self.proxyLock.release()
                            mergeJob = mergingJobs[0]
                            if mergeJob is not None:
                                tmpDestDBlocks = []
                                # get destDBlock
                                for tmpFile in mergeJob.Files:
                                    if tmpFile.type in ['output', 'log']:
                                        if not tmpFile.destinationDBlock in tmpDestDBlocks:
                                            tmpDestDBlocks.append(tmpFile.destinationDBlock)
                                # run the closer synchronously for this merge job
                                _logger.debug("start JEDI closer for %s " % name)
                                self.proxyLock.acquire()
                                cThr = Closer(taskBuffer, tmpDestDBlocks, mergeJob)
                                cThr.start()
                                cThr.join()
                                self.proxyLock.release()
                                _logger.debug("end JEDI closer for %s " % name)
                                continue
                            else:
                                _logger.debug("failed to get merging job for %s " % name)
                        else:
                            _logger.debug("failed to get merging file for %s " % name)
                        status, out = True, ''
                    elif dsExists:
                        # check if dataset exists
                        status, out = rucioAPI.getMetaData(name)
                        if status == True:
                            if out is not None:
                                try:
                                    rucioAPI.closeDataset(name)
                                    status = True
                                except Exception:
                                    errtype, errvalue = sys.exc_info()[:2]
                                    out = 'failed to freeze : {0} {1}'.format(errtype, errvalue)
                                    status = False
                            else:
                                # dataset not exist
                                status, out = True, ''
                                dsExists = False
                    else:
                        status, out = True, ''
                    if not status:
                        _logger.error('{0} failed to freeze with {1}'.format(name, out))
                    else:
                        # mark the dataset completed in the DB
                        self.proxyLock.acquire()
                        varMap = {}
                        varMap[':vuid'] = vuid
                        varMap[':status'] = 'completed'
                        taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:status,modificationdate=CURRENT_DATE WHERE vuid=:vuid", varMap)
                        self.proxyLock.release()
                        if name.startswith('pandaddm_') or name.startswith('panda.um.') or not dsExists:
                            continue
                        # set tobedeleted to dis
                        setTobeDeletedToDis(name)
                        # count # of files
                        status, out = rucioAPI.getNumberOfFiles(name)
                        if status is not True:
                            if status is False:
                                _logger.error(out)
                            else:
                                _logger.debug(out)
                        try:
                            nFile = int(out)
                            _logger.debug(nFile)
                            if nFile == 0:
                                # erase empty dataset
                                _logger.debug('erase %s' % name)
                                status, out = rucioAPI.eraseDataset(name)
                                _logger.debug('OK with %s' % name)
                        except Exception:
                            # out may not be numeric on error; best-effort only
                            pass
                else:
                    # not all files done yet: bump modification date to retry later
                    _logger.debug("wait %s " % name)
                    self.proxyLock.acquire()
                    taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET modificationdate=CURRENT_DATE WHERE vuid=:vuid", {':vuid': vuid})
                    self.proxyLock.release()
            _logger.debug("end %s " % name)
    except Exception:
        errStr = traceback.format_exc()
        _logger.error(errStr)
    # detach from the worker pool and release the thread slot unconditionally
    self.pool.remove(self)
    self.lock.release()
def run(self):
    """Close 'tobeclosed' datasets in Rucio and mark them completed in the DB.

    Non-real dataset name patterns (pandaddm_, user., group., hc_test., panda.um.)
    skip the Rucio calls; empty real datasets are erased after closing.
    """
    self.lock.acquire()
    try:
        # loop over all datasets
        for vuid, name, modDate in self.datasets:
            _logger.debug("Close %s %s" % (modDate, name))
            # these name patterns are not registered as real DDM datasets
            dsExists = True
            if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                    or name.startswith('hc_test.') or name.startswith('panda.um.'):
                dsExists = False
            if dsExists:
                # check if dataset exists
                status, out = rucioAPI.getMetaData(name)
                if status == True:
                    if out is not None:
                        try:
                            rucioAPI.closeDataset(name)
                            status = True
                        except Exception:
                            errtype, errvalue = sys.exc_info()[:2]
                            out = 'failed to freeze : {0} {1}'.format(errtype, errvalue)
                            status = False
                    else:
                        # dataset not exist
                        status, out = True, ''
                        dsExists = False
            else:
                status, out = True, ''
            if not status:
                _logger.error('{0} failed to close with {1}'.format(name, out))
            else:
                # flip tobeclosed -> completed; the status guard avoids racing updates
                self.proxyLock.acquire()
                varMap = {}
                varMap[':vuid'] = vuid
                varMap[':newstatus'] = 'completed'
                varMap[':oldstatus'] = 'tobeclosed'
                taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newstatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status=:oldstatus", varMap)
                self.proxyLock.release()
                # set tobedeleted to dis
                setTobeDeletedToDis(name)
                # skip if dataset is not real
                if not dsExists:
                    continue
                # count # of files
                status, out = rucioAPI.getNumberOfFiles(name)
                if status is not True:
                    if status is False:
                        _logger.error(out)
                    else:
                        _logger.debug(out)
                try:
                    nFile = int(out)
                    if nFile == 0:
                        # erase empty dataset
                        _logger.debug('erase %s' % name)
                        status, out = rucioAPI.eraseDataset(name)
                        _logger.debug('OK with %s' % name)
                except Exception:
                    # out may not be numeric on error; best-effort only
                    pass
    except Exception:
        pass
    # detach from the worker pool and release the thread slot unconditionally
    self.pool.remove(self)
    self.lock.release()
def run(self):
    """Delete _sub datasets whose producing jobs have all reached a final state.

    For each assigned sub dataset: collect the PandaIDs that wrote into it, verify
    every job is finished/failed/cancelled/closed (looking in jobsArchived4 and the
    30-day archive), then erase the dataset in Rucio with a grace period and mark
    the Datasets row 'deleted'.
    """
    self.lock.acquire()
    try:
        for vuid, name, modDate in self.datasets:
            # check just in case
            if re.search('_sub\d+$', name) is None:
                _logger.debug("skip non sub %s" % name)
                continue
            _logger.debug("delete sub %s" % name)
            # these name patterns are not registered as real DDM datasets
            if name.startswith('pandaddm_') or name.startswith('user.') or name.startswith('group.') \
                    or name.startswith('hc_test.') or name.startswith('panda.um.'):
                dsExists = False
            else:
                dsExists = True
            # get PandaIDs
            self.proxyLock.acquire()
            retF, resF = taskBuffer.querySQLS("SELECT /*+ index(tab FILESTABLE4_DESTDBLOCK_IDX) */ DISTINCT PandaID FROM ATLAS_PANDA.filesTable4 tab WHERE destinationDBlock=:destinationDBlock ", {':destinationDBlock': name})
            self.proxyLock.release()
            if retF is None:
                _logger.error("SQL error for sub {0}".format(name))
                continue
            else:
                _logger.debug("sub {0} has {1} jobs".format(name, len(resF)))
                self.proxyLock.acquire()
                # check jobs: recent archive first, then the long-term archive (30 days)
                sqlP = "SELECT jobStatus FROM ATLAS_PANDA.jobsArchived4 WHERE PandaID=:PandaID "
                sqlP += "UNION "
                sqlP += "SELECT jobStatus FROM ATLAS_PANDAARCH.jobsArchived WHERE PandaID=:PandaID AND modificationTime>CURRENT_DATE-30 "
                allDone = True
                for pandaID, in resF:
                    retP, resP = taskBuffer.querySQLS(sqlP, {':PandaID': pandaID})
                    if len(resP) == 0:
                        _logger.debug("skip delete sub {0} PandaID={1} not found".format(name, pandaID))
                        allDone = False
                        break
                    jobStatus = resP[0][0]
                    if jobStatus not in ['finished', 'failed', 'cancelled', 'closed']:
                        _logger.debug("skip delete sub {0} PandaID={1} is active {2}".format(name, pandaID, jobStatus))
                        allDone = False
                        break
                self.proxyLock.release()
                if allDone:
                    # erase the sub dataset; the grace period delays physical deletion
                    _logger.debug("deleting sub %s" % name)
                    try:
                        rucioAPI.eraseDataset(name, grace_period=4)
                        status = True
                    except Exception:
                        errtype, errvalue = sys.exc_info()[:2]
                        out = '{0} {1}'.format(errtype, errvalue)
                        _logger.error('{0} failed to erase with {1}'.format(name, out))
                else:
                    # some jobs still active: leave the dataset for a later pass
                    _logger.debug("wait sub %s" % name)
                    continue
            # update dataset
            self.proxyLock.acquire()
            varMap = {}
            varMap[':vuid'] = vuid
            varMap[':ost1'] = 'completed'
            varMap[':ost2'] = 'cleanup'
            varMap[':newStatus'] = 'deleted'
            taskBuffer.querySQLS("UPDATE ATLAS_PANDA.Datasets SET status=:newStatus,modificationdate=CURRENT_DATE WHERE vuid=:vuid AND status IN (:ost1,:ost2) ", varMap)
            self.proxyLock.release()
            _logger.debug("end %s " % name)
    except Exception:
        errStr = traceback.format_exc()
        _logger.error(errStr)
    # detach from the worker pool and release the thread slot unconditionally
    self.pool.remove(self)
    self.lock.release()
# freeze dataset _logger.debug("==== freeze datasets ====") timeLimitRU = datetime.datetime.utcnow() - datetime.timedelta(hours=3) timeLimitRL = datetime.datetime.utcnow() - datetime.timedelta(hours=12) timeLimitU = datetime.datetime.utcnow() - datetime.timedelta(hours=6) timeLimitL = datetime.datetime.utcnow() - datetime.timedelta(days=14) # reset doing so that Closer can update unmerged datasets sql = "SELECT name FROM ATLAS_PANDA.Datasets " sql += "WHERE type=:type AND (modificationdate BETWEEN :modificationdateRL AND :modificationdateRU) AND subType=:subType AND status=:oldStatus " varMap = {} varMap[':modificationdateRU'] = timeLimitRU varMap[':modificationdateRL'] = timeLimitRL varMap[':type'] = 'output' varMap[':subType'] = 'sub' varMap[':oldStatus'] = 'doing' retReset,resReset = taskBuffer.querySQLS(sql,varMap) sql = "UPDATE ATLAS_PANDA.Datasets SET status=:newStatus,modificationdate=:modificationdateU WHERE name=:name AND status=:oldStatus " if resReset is not None: for name, in resReset: varMap = {} varMap[':name'] = name varMap[':oldStatus'] = 'doing' varMap[':newStatus'] = 'running' varMap[':modificationdateU'] = timeLimitU _logger.debug("reset {0} to freeze".format(name)) taskBuffer.querySQLS(sql,varMap) # loop for freezer freezeLock = threading.Semaphore(5) freezeProxyLock = threading.Lock() freezeThreadPool = ThreadPool() maxRows = 100000
action='store_const', const=True, dest='resurrectDS', default=False, help='resurrect output and log datasets if they were already deleted') options = parser.parse_args() jediTaskID = int(options.tid) if True: if options.resurrectDS: sd, so = taskBuffer.querySQLS( 'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)', { ':id': jediTaskID, ':t1': 'output', ':t2': 'log' }) rc = RucioClient() for datasetName, in so: for i in range(3): try: scope, name = rucioAPI.extract_scope(datasetName) rc.get_did(scope, name) break except DataIdentifierNotFound: print('resurrect {0}'.format(datasetName)) rc.resurrect([{'scope': scope, 'name': name}]) try: rc.set_metadata(scope, name, 'lifetime', None)
if options.files is not None: files = options.files.split(',') else: # get files from rucio rc = RucioClient() scope, name = rucioAPI.extract_scope(options.ds) files_rucio = set() for i in rc.list_files(scope, name): files_rucio.add(i['name']) # get files from panda dsName = options.ds.split(':')[-1] fd, fo = taskBuffer.querySQLS( 'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c WHERE d.jediTaskID=c.jediTaskID AND d.datasetID=c.datasetID AND d.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ', { ':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName }) files = [] for tmpLFN, in fo: if tmpLFN not in files_rucio: files.append(tmpLFN) print('') print('found {0} lost files -> {1}'.format(len(files), ','.join(files))) s, jediTaskID = taskBuffer.resetFileStatusInJEDI('', True, options.ds, files, [], options.dryRun) if options.dryRun: sys.exit(0) if s:
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandacommon.pandalogger.PandaLogger import PandaLogger

# initialize cx_Oracle using dummy connection
initializer.init()

# logger
_logger = PandaLogger().getLogger('boostUser')
_logger.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# user name comes from stdin; strip only the trailing newline
# (the previous user[:-1] ate the last character of the name when
# the input did not end with a newline)
user = sys.stdin.read()
user = user.rstrip('\n')

# boost the user's queued/active analysis jobs to priority 4000
sql = "UPDATE atlas_panda.%s set currentPriority=:prio where prodUserName=:uname and prodSourceLabel IN (:label1,:label2) and currentPriority<:prio"
varMap = {}
varMap[':prio'] = 4000
varMap[':uname'] = user
varMap[':label1'] = 'user'
varMap[':label2'] = 'panda'
for table in ('jobsactive4', 'jobsdefined4'):
    _logger.debug((sql % table) + str(varMap))
    ret = taskBuffer.querySQLS(sql % table, varMap)
    _logger.debug('ret -> %s' % str(ret))

_logger.debug("================= end ==================")
def main(taskBuffer=None, exec_options=None, log_stream=None, args_list=None):
    """Find lost output/log files of a dataset or task and retry the task.

    Lost files are those registered as 'finished' in JEDI but missing in Rucio.
    Their status is reset in the DB and the task is retried (or its input is
    reloaded when --reproduceParent is set).

    :param taskBuffer: TaskBuffer instance; when None a new one is created and
        options come from the command line
    :param exec_options: dict of option overrides (may carry jediTaskID/userName)
    :param log_stream: logger to use instead of print
    :param args_list: explicit argument list for argparse
    :return: (bool success, str message)
    """
    # options
    parser = argparse.ArgumentParser()
    if taskBuffer:
        parser.add_argument('--ds', action='store', dest='ds', default=None,
                            help='dataset name')
    else:
        # --ds is mandatory when invoked from the command line
        parser.add_argument('--ds', action='store', dest='ds', default=None, required=True,
                            help='dataset name')
    parser.add_argument('--files', action='store', dest='files', default=None,
                        help='comma-separated list of lost file names. The list is dedeuced if this option is omitted')
    parser.add_argument('--noChildRetry', action='store_const', const=True, dest='noChildRetry', default=False,
                        help='not retry child tasks')
    parser.add_argument('--resurrectDS', action='store_const', const=True, dest='resurrectDS', default=False,
                        help='resurrect output and log datasets if they were already deleted')
    parser.add_argument('--dryRun', action='store_const', const=True, dest='dryRun', default=False,
                        help='dry run')
    parser.add_argument('--force', action='store_const', const=True, dest='force', default=False,
                        help='force retry even if no lost files')
    parser.add_argument('--reproduceParent', action='store_const', const=True, dest='reproduceParent',
                        default=False,
                        help='reproduce the input files from which the lost files were produced. '
                             'Typically useful to recover merged files when unmerged files were already deleted')
    # parse options
    if taskBuffer:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            # tolerate unknown args when embedded in another tool
            options, unknown = parser.parse_known_args()
    else:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options = parser.parse_args()
    # executed via command-line
    givenTaskID = None
    dn = None
    if taskBuffer is None:
        # instantiate TB
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        # set options from dict
        if exec_options is None:
            exec_options = {}
        keys = set(vars(options).keys())
        for k in exec_options:
            if k in keys:
                setattr(options, k, exec_options[k])
        if 'jediTaskID' in exec_options:
            givenTaskID = exec_options['jediTaskID']
        if 'userName' in exec_options:
            dn = exec_options['userName']
    # dataset name -> list of lost LFNs
    ds_files = {}
    if options.files is not None:
        files = options.files.split(',')
        ds_files[options.ds] = files
    else:
        # look for lost files
        if not givenTaskID:
            # dataset-driven mode: get files from rucio
            st, files_rucio = get_files_from_rucio(options.ds, log_stream)
            if st is not True:
                return st, files_rucio
            # get files from panda
            dsName = options.ds.split(':')[-1]
            fd, fo = taskBuffer.querySQLS(
                'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName})
            # lost = finished in JEDI but absent in rucio
            for tmpLFN, in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(options.ds, [])
                    ds_files[options.ds].append(tmpLFN)
            # get taskID
            td, to = taskBuffer.querySQLS(
                'SELECT jediTaskID FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE datasetName=:datasetName AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':datasetName': dsName})
            jediTaskID, = to[0]
        else:
            # task-driven mode: get dataset names
            dd, do = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE jediTaskID=:jediTaskID AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            # get files from rucio
            files_rucio = set()
            for tmpDS, in do:
                st, tmp_files_rucio = get_files_from_rucio(tmpDS, log_stream)
                if st is None:
                    return st, tmp_files_rucio
                # ignore unknown dataset
                if st:
                    files_rucio = files_rucio.union(tmp_files_rucio)
            # get files from panda
            fd, fo = taskBuffer.querySQLS(
                'SELECT d.datasetName,c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE d.jediTaskID=:jediTaskID AND c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            for tmpDS, tmpLFN in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(tmpDS, [])
                    ds_files[tmpDS].append(tmpLFN)
        # report what was found
        for tmpDS in ds_files:
            files = ds_files[tmpDS]
            msgStr = '{} has {} lost files -> {}'.format(tmpDS, len(files), ','.join(files))
            if log_stream:
                log_stream.info(msgStr)
            else:
                print(msgStr)
    # no lost files
    if not ds_files and not options.force:
        return True, "No lost files. Use --force to ignore this check"
    # reset file status
    # NOTE(review): if ds_files is empty and --force is set, the loop below never
    # runs and jediTaskID/lostInputFiles may be unbound further down — confirm
    s = False
    for tmpDS in ds_files:
        files = ds_files[tmpDS]
        if dn:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI(dn, False, tmpDS, files, options.reproduceParent, options.dryRun)
        else:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI('', True, tmpDS, files, options.reproduceParent, options.dryRun)
        msgStr = 'reset file status for {} in the DB: done with {} for jediTaskID={}'.format(tmpDS, ts, jediTaskID)
        if log_stream:
            log_stream.info(msgStr)
        else:
            print(msgStr)
        s |= ts
    # recover parent
    if options.reproduceParent:
        # reproduce input: recurse once per parent dataset holding lost input files
        for lostDS in lostInputFiles:
            com_args = ['--ds', lostDS, '--noChildRetry', '--resurrectDS']
            if options.dryRun:
                com_args.append('--dryRun')
            com_args += ['--files', ','.join(lostInputFiles[lostDS])]
            main(taskBuffer=taskBuffer, log_stream=log_stream, args_list=com_args)
    # go ahead
    if options.dryRun:
        return True, 'Done in the dry-run mode with {}'.format(s)
    if s or options.force:
        if options.resurrectDS:
            # bring deleted output/log datasets back to life in rucio
            sd, so = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
                {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
            rc = RucioClient()
            for datasetName, in so:
                # retry up to 3 times until the DID is visible
                for i in range(3):
                    try:
                        scope, name = rucioAPI.extract_scope(datasetName)
                        rc.get_did(scope, name)
                        break
                    except DataIdentifierNotFound:
                        print('resurrect {0}'.format(datasetName))
                        rc.resurrect([{'scope': scope, 'name': name}])
                        try:
                            rc.set_metadata(scope, name, 'lifetime', None)
                        except Exception:
                            pass
        if not options.reproduceParent:
            msgStr = Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1]
        else:
            msgStr = Client.reloadInput(jediTaskID)[-1][-1]
        if log_stream:
            log_stream.info("Retried task with {}".format(msgStr))
            log_stream.info("Done")
        else:
            print("Retried task: done with {}".format(msgStr))
        return True, msgStr
    else:
        msgStr = 'failed'
        if log_stream:
            log_stream.error(msgStr)
        else:
            print(msgStr)
        return False, msgStr
# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "
varMap = {}
varMap[':label1'] = 'managed'
varMap[':label2'] = 'test'
varMap[':es'] = 1
varMap[':prio'] = 200
varMap[':jobStat'] = 'running'
# exec
status, res = taskBuffer.querySQLS(sqlEsJobs, varMap, arraySize=100000)
if res is None:
    tmpLog.debug("total %s " % res)
    # a failed query previously fell through to the loop below and crashed
    # with a TypeError; treat it as an empty result instead
    res = []
else:
    tmpLog.debug("total %s " % len(res))
# get number of jobs per site
siteJobsMap = {}
for pandaID, siteName, commandToPilot, startTime in res:
    if siteName not in siteJobsMap:
        siteJobsMap[siteName] = {'running': [], 'killing': []}
    if commandToPilot == 'tobekilled':
        siteJobsMap[siteName]['killing'].append(pandaID)
    else:
        # kill only old jobs
        if startTime < timeLimit:
            siteJobsMap[siteName]['running'].append(pandaID)
def main(argv=tuple(), tbuf=None, **kwargs):
    """Periodic housekeeping session: purge stale defined jobs, aggregate pilot
    request counts from the dispatcher log, refresh nRunning site data, finish or
    kill co-jumbo jobs, and fork Setupper processes for pending set.* files.

    :param argv: unused command-line arguments
    :param tbuf: optional pre-initialized TaskBuffer; a new one is created when None
    """
    # py2/py3 compatibility shim: 'long' does not exist in py3
    # (the bare reference raises UnboundLocalError, a NameError subclass)
    try:
        long
    except NameError:
        long = int
    tmpLog = LogWrapper(_logger, None)
    tmpLog.debug("===================== start =====================")
    # current minute
    currentMinute = datetime.datetime.utcnow().minute
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)
    # delete stale rows from jobsDefined4 below the current max PandaID
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass
    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running (daily at 03:02 +/- 5 min)
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3, minute=2, second=0, microsecond=0)
        if (timeNow > logRotateTime and (timeNow - logRotateTime) < datetime.timedelta(minutes=5)) or \
                (logRotateTime > timeNow and (logRotateTime - timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit: 3h window overall, 1h window for the short counters
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
            # check if tgz is required: if the live log starts before the window,
            # it already covers everything and the rotated .gz is not needed
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow(
            ).strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy into the working tmp file
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (
                        tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string: timestamp + method/site/node/type fields
                sStr = '^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += 'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)
                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum per site/method/node over the 3h window
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short window (1h) counters
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update site data with both aggregation intervals
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))
    # nRunning: only one host per interval slot inserts the numbers
    tmpLog.debug("nRunning session")
    try:
        if (currentMinute / panda_config.nrun_interval
            ) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))
    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            # finish co-jumbo jobs in Active; close those with inconsistent input
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(
                len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], False)
            # same for co-jumbo jobs in Defined
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(
                len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], True)
            # same for co-jumbo jobs in Waiting
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(
                len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(
                        jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], False, True)
            # kill flagged co-jumbo jobs in batches of 100
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(
                len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob], 51,
                                    keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)
    tmpLog.debug("Fork session")

    # thread for fork: runs forkSetupper.py on one set.* file in a subshell
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            # path of the set.* file to process
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)
    # the max number of threads
    maxThr = 10
    nThr = 0
    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files (between 1 min and 1 hour old)
            modTime = datetime.datetime(
                *(time.gmtime(os.path.getmtime(tmpName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))
    # join fork threads
    for thr in forkThrList:
        thr.join()
    # terminate TaskBuffer IF
    # taskBufferIF.terminate()
    tmpLog.debug("===================== end =====================")
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1) # instantiate MyProxy I/F my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface() # roles if hasattr(panda_config, 'proxy_cache_roles'): roles = panda_config.proxy_cache_roles.split(',') else: roles = [ 'atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot' ] # get users sql = 'select distinct DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt' varMap = {} varMap[':patt'] = '%p%' tmpStat, tmpRes = taskBuffer.querySQLS(sql, varMap) for realDN, in tmpRes: if realDN is None: continue realDN = re.sub('/CN=limited proxy', '', realDN) realDN = re.sub('(/CN=proxy)+', '', realDN) realDN = re.sub('(/CN=\d+)+$', '', realDN) # check proxy tmpLog.debug("check proxy cache for DN={0}".format(realDN)) for role in roles: my_proxy_interface_instance.checkProxy(realDN, role=role) tmpLog.debug("done")