def uploadLog(self, id):
    strMsg = self.dumpToString()
    s, o = Client.uploadLog(strMsg, id)
    if s != 0:
        return "failed to upload log with {0}.".format(s)
    if o.startswith('http'):
        return '<a href="{0}">log</a>'.format(o)
    return o

def doCheck(self,taskSpecList):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retFatal = self.SC_FATAL,{}
    retTmpError = self.SC_FAILED,{}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID,tmpCoreName in cloudsInPanda.iteritems():
        tmpLog.debug('jediTaskID={0} -> {1}'.format(tmpTaskID,tmpCoreName))
        if not tmpCoreName in ['NULL','',None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get nucleus
                nucleus = siteSpec.pandasite
                # get output/log datasets
                tmpStat,tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(tmpTaskID,['output','log'])
                # get destinations
                retMap[tmpTaskID] = {'datasets':[],'nucleus':nucleus}
                for datasetSpec in tmpDatasetSpecs:
                    # skip distributed datasets
                    if DataServiceUtils.getDistributedDestination(datasetSpec.storageToken) != None:
                        continue
                    # get token
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm,datasetSpec.storageToken)
                    # use default endpoint
                    if token == None:
                        token = siteSpec.ddm
                    # add original token
                    if not datasetSpec.storageToken in ['',None]:
                        token += '/{0}'.format(datasetSpec.storageToken)
                    retMap[tmpTaskID]['datasets'].append({'datasetID':datasetSpec.datasetID,
                                                          'token':'dst:{0}'.format(token),
                                                          'destination':tmpCoreName})
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,retMap

def uploadLog(self):
    if self.jediTaskID is None:
        return 'cannot find jediTaskID'
    strMsg = self.logger.dumpToString()
    s, o = Client.uploadLog(strMsg, self.jediTaskID)
    if s != 0:
        return "failed to upload log with {0}.".format(s)
    if o.startswith('http'):
        return '<a href="{0}">log</a>'.format(o)
    return o

def doCheck(self, taskSpecList):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug("start doCheck")
    # return for failure
    retFatal = self.SC_FATAL, {}
    retTmpError = self.SC_FAILED, {}
    # get list of jediTaskIDs
    taskIdList = []
    taskSpecMap = {}
    for taskSpec in taskSpecList:
        taskIdList.append(taskSpec.jediTaskID)
        taskSpecMap[taskSpec.jediTaskID] = taskSpec
    # check with panda
    tmpLog.debug("check with panda")
    tmpPandaStatus, cloudsInPanda = PandaClient.seeCloudTask(taskIdList)
    if tmpPandaStatus != 0:
        tmpLog.error("failed to see clouds")
        return retTmpError
    # make return map
    retMap = {}
    for tmpTaskID, tmpCoreName in cloudsInPanda.iteritems():
        tmpLog.debug("jediTaskID={0} -> {1}".format(tmpTaskID, tmpCoreName))
        if not tmpCoreName in ["NULL", "", None]:
            taskSpec = taskSpecMap[tmpTaskID]
            if taskSpec.useWorldCloud():
                # get destinations for WORLD cloud
                ddmIF = self.ddmIF.getInterface(taskSpec.vo)
                # get site
                siteSpec = self.siteMapper.getSite(tmpCoreName)
                # get output/log datasets
                tmpStat, tmpDatasetSpecs = self.taskBufferIF.getDatasetsWithJediTaskID_JEDI(
                    tmpTaskID, ["output", "log"]
                )
                # get destinations
                retMap[tmpTaskID] = []
                for datasetSpec in tmpDatasetSpecs:
                    token = ddmIF.convertTokenToEndpoint(siteSpec.ddm, datasetSpec.storageToken)
                    # use default endpoint
                    if token == None:
                        token = siteSpec.ddm
                    retMap[tmpTaskID].append(
                        {
                            "datasetID": datasetSpec.datasetID,
                            "token": "dst:{0}".format(token),
                            "destination": tmpCoreName,
                        }
                    )
            else:
                retMap[tmpTaskID] = tmpCoreName
    tmpLog.debug("ret {0}".format(str(retMap)))
    # return
    tmpLog.debug("done")
    return self.SC_SUCCEEDED, retMap

def getStatus(self, expectedStates):
    idList = [job['jobID'] for job in self.__jobList]
    _logger.info("%s" % idList)
    status, jobInfoList = Client.getJobStatus(idList)
    _logger.info("%s" % jobInfoList)
    assert status == 0, "Retrieval of job state finished with status: %s" % status
    for job in jobInfoList:
        assert job.jobStatus in expectedStates, "Recently defined job was not in states %s (PandaID: %s jobStatus: %s)" % (
            expectedStates, job.PandaID, job.jobStatus)
    return jobInfoList

def doCheck(self,taskSpecList):
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doCheck')
    # return for failure
    retFatal = self.SC_FATAL,{}
    retTmpError = self.SC_FAILED,{}
    # get list of reqIDs which are mapped to taskID in Panda
    reqIdTaskIdMap = {}
    for taskSpec in taskSpecList:
        if taskSpec.reqID != None:
            if reqIdTaskIdMap.has_key(taskSpec.reqID):
                tmpLog.error('reqID={0} is duplicated in jediTaskID={1},{2}'.format(taskSpec.reqID,
                                                                                    taskSpec.jediTaskID,
                                                                                    reqIdTaskIdMap[taskSpec.reqID]))
            else:
                reqIdTaskIdMap[taskSpec.reqID] = taskSpec.jediTaskID
                tmpLog.debug('jediTaskID={0} has reqID={1}'.format(taskSpec.jediTaskID,taskSpec.reqID))
        else:
            tmpLog.error('jediTaskID={0} has undefined reqID'.format(taskSpec.jediTaskID))
    # check with panda
    tmpLog.debug('check with panda')
    tmpPandaStatus,cloudsInPanda = PandaClient.seeCloudTask(reqIdTaskIdMap.keys())
    if tmpPandaStatus != 0:
        tmpLog.error('failed to see clouds')
        return retTmpError
    # make return map
    retMap = {}
    for tmpReqID,tmpCloud in cloudsInPanda.iteritems():
        if not tmpCloud in ['NULL','',None]:
            tmpLog.debug('reqID={0} jediTaskID={1} -> {2}'.format(tmpReqID,reqIdTaskIdMap[tmpReqID],tmpCloud))
            """
            # check file availability
            tmpSt = self.findMissingFiles(reqIdTaskIdMap[tmpReqID],tmpCloud)
            if tmpSt != self.SC_SUCCEEDED:
                tmpLog.error('failed to check file availability for jediTaskID={0}'.format(reqIdTaskIdMap[tmpReqID]))
                continue
            """
            retMap[reqIdTaskIdMap[tmpReqID]] = tmpCloud
    tmpLog.debug('ret {0}'.format(str(retMap)))
    # return
    tmpLog.debug('done')
    return self.SC_SUCCEEDED,retMap

def generateJobs(self):
    for i in range(self.__nJobs):
        job = self.defineEvgen16Job(i)
        self.__jobList.append({'jobSpec': job, 'jobID': None})
    status, output = Client.submitJobs(
        [job['jobSpec'] for job in self.__jobList]
    )  # Return from submitJobs: ret.append((job.PandaID,job.jobDefinitionID,{'jobsetID':job.jobsetID}))
    assert status == 0, "Submission of jobs finished with status: %s" % status
    assert len(self.__jobList) == len(
        output), "Not all jobs seem to have been submitted properly"
    for job, ids in zip(self.__jobList, output):
        jobID = ids[0]
        job['jobID'] = jobID
        _logger.info("Generated job PandaID = %s" % jobID)
    return

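# A minimal driver sketch (not part of the original snippets) showing how the two test
# helpers above could be exercised together. The class name TestJobFlow and the list of
# expected states are assumptions for illustration only; the helpers themselves submit
# via Client.submitJobs and poll via Client.getJobStatus as defined above.
test = TestJobFlow()  # assumed test class providing defineEvgen16Job, __nJobs and __jobList
test.generateJobs()   # submit the jobs and record their PandaIDs
test.getStatus(expectedStates=['defined', 'activated'])  # assert the jobs are in an early state
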
def main(taskBuffer=None, exec_options=None, log_stream=None, args_list=None):
    # options
    parser = argparse.ArgumentParser()
    if taskBuffer:
        parser.add_argument('--ds', action='store', dest='ds', default=None,
                            help='dataset name')
    else:
        parser.add_argument('--ds', action='store', dest='ds', default=None, required=True,
                            help='dataset name')
    parser.add_argument('--files', action='store', dest='files', default=None,
                        help='comma-separated list of lost file names. The list is deduced if this option is omitted')
    parser.add_argument('--noChildRetry', action='store_const', const=True, dest='noChildRetry', default=False,
                        help='not retry child tasks')
    parser.add_argument('--resurrectDS', action='store_const', const=True, dest='resurrectDS', default=False,
                        help='resurrect output and log datasets if they were already deleted')
    parser.add_argument('--dryRun', action='store_const', const=True, dest='dryRun', default=False,
                        help='dry run')
    parser.add_argument('--force', action='store_const', const=True, dest='force', default=False,
                        help='force retry even if no lost files')
    parser.add_argument('--reproduceParent', action='store_const', const=True, dest='reproduceParent', default=False,
                        help='reproduce the input files from which the lost files were produced. '
                             'Typically useful to recover merged files when unmerged files were already deleted')
    # parse options
    if taskBuffer:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options, unknown = parser.parse_known_args()
    else:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options = parser.parse_args()
    # executed via command-line
    givenTaskID = None
    dn = None
    if taskBuffer is None:
        # instantiate TB
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        # set options from dict
        if exec_options is None:
            exec_options = {}
        keys = set(vars(options).keys())
        for k in exec_options:
            if k in keys:
                setattr(options, k, exec_options[k])
        if 'jediTaskID' in exec_options:
            givenTaskID = exec_options['jediTaskID']
        if 'userName' in exec_options:
            dn = exec_options['userName']
    ds_files = {}
    if options.files is not None:
        files = options.files.split(',')
        ds_files[options.ds] = files
    else:
        # look for lost files
        if not givenTaskID:
            # get files from rucio
            st, files_rucio = get_files_from_rucio(options.ds, log_stream)
            if st is not True:
                return st, files_rucio
            # get files from panda
            dsName = options.ds.split(':')[-1]
            fd, fo = taskBuffer.querySQLS(
                'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName})
            for tmpLFN, in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(options.ds, [])
                    ds_files[options.ds].append(tmpLFN)
            # get taskID
            td, to = taskBuffer.querySQLS(
                'SELECT jediTaskID FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE datasetName=:datasetName AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':datasetName': dsName})
            jediTaskID, = to[0]
        else:
            # get dataset names
            dd, do = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE jediTaskID=:jediTaskID AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            # get files from rucio
            files_rucio = set()
            for tmpDS, in do:
                st, tmp_files_rucio = get_files_from_rucio(tmpDS, log_stream)
                if st is None:
                    return st, tmp_files_rucio
                # ignore unknown dataset
                if st:
                    files_rucio = files_rucio.union(tmp_files_rucio)
            # get files from panda
            fd, fo = taskBuffer.querySQLS(
                'SELECT d.datasetName,c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE d.jediTaskID=:jediTaskID AND c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            for tmpDS, tmpLFN in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(tmpDS, [])
                    ds_files[tmpDS].append(tmpLFN)
        for tmpDS in ds_files:
            files = ds_files[tmpDS]
            msgStr = '{} has {} lost files -> {}'.format(tmpDS, len(files), ','.join(files))
            if log_stream:
                log_stream.info(msgStr)
            else:
                print(msgStr)
    # no lost files
    if not ds_files and not options.force:
        return True, "No lost files. Use --force to ignore this check"
    # reset file status
    s = False
    for tmpDS in ds_files:
        files = ds_files[tmpDS]
        if dn:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI(dn, False, tmpDS, files,
                                                                              options.reproduceParent,
                                                                              options.dryRun)
        else:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI('', True, tmpDS, files,
                                                                              options.reproduceParent,
                                                                              options.dryRun)
        msgStr = 'reset file status for {} in the DB: done with {} for jediTaskID={}'.format(tmpDS, ts, jediTaskID)
        if log_stream:
            log_stream.info(msgStr)
        else:
            print(msgStr)
        s |= ts
    # recover parent
    if options.reproduceParent:
        # reproduce input
        for lostDS in lostInputFiles:
            com_args = ['--ds', lostDS, '--noChildRetry', '--resurrectDS']
            if options.dryRun:
                com_args.append('--dryRun')
            com_args += ['--files', ','.join(lostInputFiles[lostDS])]
            main(taskBuffer=taskBuffer, log_stream=log_stream, args_list=com_args)
    # go ahead
    if options.dryRun:
        return True, 'Done in the dry-run mode with {}'.format(s)
    if s or options.force:
        if options.resurrectDS:
            sd, so = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
                {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
            rc = RucioClient()
            for datasetName, in so:
                for i in range(3):
                    try:
                        scope, name = rucioAPI.extract_scope(datasetName)
                        rc.get_did(scope, name)
                        break
                    except DataIdentifierNotFound:
                        print('resurrect {0}'.format(datasetName))
                        rc.resurrect([{'scope': scope, 'name': name}])
                        try:
                            rc.set_metadata(scope, name, 'lifetime', None)
                        except Exception:
                            pass
        if not options.reproduceParent:
            msgStr = Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1]
        else:
            msgStr = Client.reloadInput(jediTaskID)[-1][-1]
        if log_stream:
            log_stream.info("Retried task with {}".format(msgStr))
            log_stream.info("Done")
        else:
            print("Retried task: done with {}".format(msgStr))
        return True, msgStr
    else:
        msgStr = 'failed'
        if log_stream:
            log_stream.error(msgStr)
        else:
            print(msgStr)
        return False, msgStr

def main(argv=tuple(), tbuf=None, **kwargs):
    try:
        long
    except NameError:
        long = int
    tmpLog = LogWrapper(_logger, None)
    tmpLog.debug("===================== start =====================")
    # current minute
    currentMinute = datetime.datetime.utcnow().minute
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)
    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass
    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3, minute=2, second=0, microsecond=0)
        if (timeNow > logRotateTime and (timeNow - logRotateTime) < datetime.timedelta(minutes=5)) or \
                (logRotateTime > timeNow and (logRotateTime - timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(
                    dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = '^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += 'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)
                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))
    # nRunning
    tmpLog.debug("nRunning session")
    try:
        if (currentMinute / panda_config.nrun_interval) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))
    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                        taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob], 51, keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)
    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)
    # the max number of threads
    maxThr = 10
    nThr = 0
    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(tmpName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output(
                    'ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))
    # join fork threads
    for thr in forkThrList:
        thr.join()
    # terminate TaskBuffer IF
    # taskBufferIF.terminate()
    tmpLog.debug("===================== end =====================")

def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue):
    # variables for submission
    maxBunchTask = 100
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug("start doBrokerage")
    # return for failure
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    tmpLog.debug("vo={0} label={1} queue={2}".format(vo, prodSourceLabel, workQueue.queue_name))
    # loop over all tasks
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for tmpJediTaskID, tmpInputList in inputList:
        for taskSpec, cloudName, inputChunk in tmpInputList:
            # make JobSpec to be submitted for TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = "managed"
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024
            if taskSpec.useWorldCloud():
                # use destinationSE to trigger task brokerage in WORLD cloud
                jobSpec.destinationSE = taskSpec.cloud
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                    jobSpec.addFile(tmpInFileSpec)
            # use secondary dataset name as prodDBlock
            if setProdDBlock == False and prodDBlock != None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority
            if not allRwMap.has_key(jobSpec.currentPriority):
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, jobSpec.currentPriority
                )
                if tmpRW == None:
                    tmpLog.error("failed to calculate RW with prio={0}".format(jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW == None:
                tmpLog.error("failed to calculate RW for jediTaskID={0}".format(jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # get fullRWs
    fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
    if fullRWs == None:
        tmpLog.error("failed to calculate full RW")
        return retTmpError
    # set metadata
    for jobSpec in jobSpecList:
        rwValues = allRwMap[jobSpec.currentPriority]
        jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (
            jobSpec.metadata,
            str(rwValues),
            str(expRWs),
            str(prioMap),
            str(fullRWs),
            str(tt2Map),
        )
    tmpLog.debug("run task assigner for {0} tasks".format(len(jobSpecList)))
    nBunchTask = 0
    while nBunchTask < len(jobSpecList):
        # get a bunch
        jobsBunch = jobSpecList[nBunchTask : nBunchTask + maxBunchTask]
        strIDs = "jediTaskID="
        for tmpJobSpec in jobsBunch:
            strIDs += "{0},".format(tmpJobSpec.taskID)
        strIDs = strIDs[:-1]
        tmpLog.debug(strIDs)
        # increment index
        nBunchTask += maxBunchTask
        # run task brokerage
        stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
        tmpLog.debug("{0}:{1}".format(stS, str(outSs)))
    # return
    tmpLog.debug("done")
    return self.SC_SUCCEEDED

    sys.exit(0)

if s:
    if options.resurrectDS:
        sd, so = taskBuffer.querySQLS(
            'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
            {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
        rc = RucioClient()
        for datasetName, in so:
            for i in range(3):
                try:
                    scope, name = rucioAPI.extract_scope(datasetName)
                    rc.get_did(scope, name)
                    break
                except DataIdentifierNotFound:
                    print('resurrect {0}'.format(datasetName))
                    rc.resurrect([{'scope': scope, 'name': name}])
                    try:
                        rc.set_metadata(scope, name, 'lifetime', None)
                    except Exception:
                        pass
    print(Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1])
    print('done for jediTaskID={0}'.format(jediTaskID))
else:
    print('failed')

import optparse

import pandaserver.userinterface.Client as Client

optP = optparse.OptionParser(conflict_handler="resolve")
options, args = optP.parse_args()

jediTaskID = args[0]

s, o = Client.finishTask(jediTaskID)
print(o)

proxyS = DBProxy()
proxyS.connect(panda_config.dbhost, panda_config.dbpasswd, panda_config.dbuser, panda_config.dbname)

jobs = []
varMap = {}
varMap[':prodSourceLabel'] = 'managed'
varMap[':taskID'] = args[0]
varMap[':pandaIDl'] = args[1]
varMap[':pandaIDu'] = args[2]
sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID"
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4', 'ATLAS_PANDA.jobsDefined4']:
    status, res = proxyS.querySQLS(sql % table, varMap)
    if res is not None:
        for id, in res:
            if not id in jobs:
                jobs.append(id)

print('The number of jobs to be killed : %s' % len(jobs))

if len(jobs):
    nJob = 100
    iJob = 0
    while iJob < len(jobs):
        print('kill %s' % str(jobs[iJob:iJob + nJob]))
        if options.forceKill:
            Client.killJobs(jobs[iJob:iJob + nJob], 9, useMailAsID=useMailAsIDV)
        else:
            Client.killJobs(jobs[iJob:iJob + nJob], useMailAsID=useMailAsIDV)
        iJob += nJob
        time.sleep(1)

job.destinationDBlock = datasetName
job.destinationSE = 'local'
job.currentPriority = 1000
job.prodSourceLabel = 'panda'
job.jobParameters = ' --lsstJobParams="%s" ' % lsstJobParams
if prodUserName is not None:
    job.prodUserName = prodUserName
else:
    job.prodUserName = prodUserNameDefault
if PIPELINE_PROCESSINSTANCE is not None:
    job.taskID = PIPELINE_PROCESSINSTANCE
if PIPELINE_EXECUTIONNUMBER is not None:
    job.attemptNr = PIPELINE_EXECUTIONNUMBER
if PIPELINE_TASK is not None:
    job.processingType = PIPELINE_TASK
job.computingSite = site
job.VO = "lsst"

fileOL = FileSpec()
fileOL.lfn = "%s.job.log.tgz" % job.jobName
fileOL.destinationDBlock = job.destinationDBlock
fileOL.destinationSE = job.destinationSE
fileOL.dataset = job.destinationDBlock
fileOL.type = 'log'
job.addFile(fileOL)

s, o = Client.submitJobs([job], srvID=aSrvID)
print(s)
for x in o:
    print("PandaID=%s" % x[0])

sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsDefined4 " else: sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsActive4 " sql += "WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" status, res = proxyS.querySQLS(sql, varMap) print("got {0} jobs".format(len(res))) jobs = [] jediJobs = [] if res is not None: for (id, lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print('reassign %s' % str(jobs[iJob:iJob + nJob])) Client.reassignJobs(jobs[iJob:iJob + nJob]) iJob += nJob if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print('kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob])) Client.killJobs(jediJobs[iJob:iJob + nJob], 51) iJob += nJob
import pandaserver.userinterface.Client as Client
from pandaserver.userinterface.Client import baseURLSSL

from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandaserver.brokerage.SiteMapper import SiteMapper
from pandaserver.config import panda_config
from pandaserver.dataservice import DataServiceUtils
from pandaserver.dataservice.DataServiceUtils import select_scope

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

id = sys.argv[1]
s, o = Client.getJobStatus([id])
if s != 0:
    print("failed to get job with:%s" % s)
    sys.exit(0)

job = o[0]
if job is None:
    print("got None")
    sys.exit(0)

xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
<!-- ATLAS file meta-data catalog -->
<!DOCTYPE POOLFILECATALOG SYSTEM "InMemory">
<POOLFILECATALOG>

jediTaskID = int(options.tid)

if True:
    if options.resurrectDS:
        sd, so = taskBuffer.querySQLS(
            'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
            {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
        rc = RucioClient()
        for datasetName, in so:
            for i in range(3):
                try:
                    scope, name = rucioAPI.extract_scope(datasetName)
                    rc.get_did(scope, name)
                    break
                except DataIdentifierNotFound:
                    print('resurrect {0}'.format(datasetName))
                    rc.resurrect([{'scope': scope, 'name': name}])
                    try:
                        rc.set_metadata(scope, name, 'lifetime', None)
                    except:
                        pass
    print(Client.reloadInput(jediTaskID)[-1])
    print('done for jediTaskID={0}'.format(jediTaskID))
else:
    print('failed')

if options.forceKill:
    codeV = 9
elif options.killUserJobs:
    codeV = 91
else:
    try:
        codeV = int(options.codeV)
    except Exception:
        pass
if options.killOwnProdJobs:
    useMailAsIDV = True

if len(args) == 1:
    Client.killJobs([args[0]], code=codeV, useMailAsID=useMailAsIDV,
                    keepUnmerged=options.keepUnmerged, jobSubStatus=options.jobSubStatus)
else:
    startID = int(args[0])
    endID = int(args[1])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    Client.killJobs(range(startID, endID + 1), code=codeV, useMailAsID=useMailAsIDV,
                    keepUnmerged=options.keepUnmerged, jobSubStatus=options.jobSubStatus)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=1)

# instantiate DB proxies
proxyS = DBProxy()
proxyS.connect(panda_config.dbhost, panda_config.dbpasswd, panda_config.dbuser, panda_config.dbname)

while True:
    # get PandaIDs
    varMap = {}
    varMap[':modificationTime'] = timeLimit
    sql = "SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE modificationTime<:modificationTime ORDER BY PandaID"
    status, res = proxyS.querySQLS(sql, varMap)
    # escape
    if len(res) == 0:
        break
    # convert to list
    jobs = []
    for id, in res:
        jobs.append(id)
    # reassign
    nJob = 300
    iJob = 0
    while iJob < len(jobs):
        print('reassignJobs(%s)' % jobs[iJob:iJob + nJob])
        Client.reassignJobs(jobs[iJob:iJob + nJob])
        iJob += nJob
        time.sleep(60)

def putFile(req, file):
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    _logger.debug("putFile : start %s %s" % (req.subprocess_env['SSL_CLIENT_S_DN'], file.filename))
    # size check
    fullSizeLimit = 768 * 1024 * 1024
    if not file.filename.startswith('sources.'):
        noBuild = True
        sizeLimit = 100 * 1024 * 1024
    else:
        noBuild = False
        sizeLimit = fullSizeLimit
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except Exception:
        if "content-length" in req.headers_in:
            _logger.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            _logger.error("no CL")
    _logger.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength, sizeLimit)
        if noBuild:
            errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit"
        else:
            errStr += " Please remove redundant files from your workarea"
        _logger.error(errStr)
        _logger.debug("putFile : end")
        return errStr
    try:
        fileFullPath = '%s/%s' % (panda_config.cache_dir, file.filename.split('/')[-1])
        # avoid overwriting
        if os.path.exists(fileFullPath):
            # touch
            os.utime(fileFullPath, None)
            # send error message
            errStr = "ERROR : Cannot overwrite file"
            _logger.debug('putFile : cannot overwrite file %s' % file.filename)
            _logger.debug("putFile : end")
            return errStr
        # write
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
    except Exception:
        errStr = "ERROR : Cannot write file"
        _logger.error(errStr)
        _logger.debug("putFile : end")
        return errStr
    # checksum
    try:
        # decode Footer
        footer = fileContent[-8:]
        checkSum, isize = struct.unpack("II", footer)
        _logger.debug("CRC from gzip Footer %s" % checkSum)
    except Exception:
        # calculate on the fly
        """
        import zlib
        checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF
        """
        # use None to avoid delay for now
        checkSum = None
        _logger.debug("CRC calculated %s" % checkSum)
    # file size
    fileSize = len(fileContent)
    # user name
    username = cleanUserID(req.subprocess_env['SSL_CLIENT_S_DN'])
    _logger.debug("putFile : written dn=%s file=%s size=%s crc=%s" % \
                  (username, file.filename, fileSize, checkSum))
    # put file info to DB
    statClient, outClient = Client.insertSandboxFileInfo(username, file.filename, fileSize, checkSum)
    if statClient != 0 or outClient.startswith("ERROR"):
        _logger.error("putFile : failed to put sandbox to DB with %s %s" % (statClient, outClient))
        # _logger.debug("putFile : end")
        # return "ERROR : Cannot insert sandbox to DB"
    else:
        _logger.debug("putFile : inserted sandbox to DB with %s" % outClient)
    # store to cassandra
    if hasattr(panda_config, 'cacheUseCassandra') and panda_config.cacheUseCassandra == True:
        try:
            # time-stamp
            timeNow = datetime.datetime.utcnow()
            creationTime = timeNow.strftime('%Y-%m-%d %H:%M:%S')
            # user name
            username = req.subprocess_env['SSL_CLIENT_S_DN']
            username = username.replace('/CN=proxy', '')
            username = username.replace('/CN=limited proxy', '')
            # file size
            fileSize = len(fileContent)
            # key
            fileKeyName = file.filename.split('/')[-1]
            sizeCheckSum = '%s:%s' % (fileSize, checkSum)
            # insert to cassandra
            import pycassa
            pool = pycassa.ConnectionPool(panda_config.cacheKeySpace)
            filefamily = pycassa.ColumnFamily(pool, panda_config.cacheFileTable)
            # avoid overwriting
            gotoNextCassa = True
            if filefamily.get_count(fileKeyName) > 0:
                # touch
                touchFlag = touchFileCassa(filefamily, fileKeyName, timeNow)
                if touchFlag:
                    gotoNextCassa = False
                    # send error message
                    errStr = "ERROR : Cannot overwrite file in Cassandra"
                    _logger.error(errStr)
                    if not panda_config.cacheIgnoreCassandraError:
                        _logger.debug("putFile : end")
                        return errStr
            # check uniqueness with size and checksum
            if gotoNextCassa:
                try:
                    uniqExp = pycassa.index.create_index_expression('uniqID', sizeCheckSum)
                    userExp = pycassa.index.create_index_expression('user', username)
                    tmpClause = pycassa.index.create_index_clause([uniqExp, userExp])
                    tmpResults = filefamily.get_indexed_slices(tmpClause, columns=['creationTime'])
                    for oldFileKeyName, tmpDict in tmpResults:
                        _logger.debug('The same size and chksum %s found in old:%s and new:%s' % \
                                      (sizeCheckSum, oldFileKeyName, fileKeyName))
                        # touch
                        touchFlag = touchFileCassa(filefamily, oldFileKeyName, timeNow)
                        if touchFlag:
                            # make alias
                            _logger.debug('Making alias %s->%s' % (fileKeyName, oldFileKeyName))
                            insertWithRetryCassa(filefamily, fileKeyName,
                                                 {'alias': oldFileKeyName,
                                                  'creationTime': creationTime,
                                                  'nSplit': 0,
                                                  },
                                                 'putFile : make alias for %s' % file.filename)
                            # set time
                            touchFileCassa(filefamily, fileKeyName, timeNow)
                            _logger.debug("putFile : end")
                            return True
                except Exception:
                    gotoNextCassa = False
                    errType, errValue = sys.exc_info()[:2]
                    errStr = "cannot make alias for %s due to %s %s" % (fileKeyName, errType, errValue)
                    _logger.error(errStr)
                    if not panda_config.cacheIgnoreCassandraError:
                        _logger.debug("putFile : end")
                        return errStr
            # insert new record
            if gotoNextCassa:
                splitIdx = 0
                splitSize = 5 * 1024 * 1024
                nSplit, tmpMod = divmod(len(fileContent), splitSize)
                if tmpMod != 0:
                    nSplit += 1
                _logger.debug('Inserting %s with %s blocks' % (fileKeyName, nSplit))
                for splitIdx in range(nSplit):
                    # split to small chunks since cassandra is not good at large files
                    tmpFileContent = fileContent[splitSize * splitIdx:splitSize * (splitIdx + 1)]
                    tmpFileKeyName = fileKeyName
                    tmpAttMap = {'file': tmpFileContent,
                                 'user': username,
                                 'creationTime': creationTime,
                                 }
                    if splitIdx == 0:
                        tmpAttMap['size'] = fileSize
                        tmpAttMap['nSplit'] = nSplit
                        tmpAttMap['uniqID'] = sizeCheckSum
                        tmpAttMap['checkSum'] = str(checkSum)
                    else:
                        tmpFileKeyName += '_%s' % splitIdx
                        tmpAttMap['size'] = 0
                        tmpAttMap['nSplit'] = 0
                    # insert with retry
                    insertWithRetryCassa(filefamily, tmpFileKeyName, tmpAttMap,
                                         'putFile : insert %s' % file.filename)
                # set time
                touchFileCassa(filefamily, fileKeyName, timeNow)
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            errStr = "cannot put %s into Cassandra due to %s %s" % (fileKeyName, errType, errValue)
            _logger.error(errStr)
            # send error message
            errStr = "ERROR : " + errStr
            if not panda_config.cacheIgnoreCassandraError:
                _logger.debug("putFile : end")
                return errStr
    _logger.debug("putFile : %s end" % file.filename)
    return True

import sys

import pandaserver.userinterface.Client as Client

if len(sys.argv) == 2:
    jobDefIDs = [sys.argv[1]]
else:
    startID = int(sys.argv[1])
    endID = int(sys.argv[2])
    if startID > endID:
        print('%d is less than %d' % (endID, startID))
        sys.exit(1)
    jobDefIDs = range(startID, endID + 1)

# query PandaID
status, ids = Client.queryPandaIDs(jobDefIDs)
if status != 0:
    sys.exit(0)

# remove None
while True:
    if None not in ids:
        break
    ids.remove(None)

# kill
if len(ids) != 0:
    Client.killJobs(ids)

import optparse

import pandaserver.userinterface.Client as Client

aSrvID = None

optP = optparse.OptionParser(conflict_handler="resolve")
options, args = optP.parse_args()

jediTaskID = args[0]

s, o = Client.killTask(jediTaskID)
print(o)

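# The task-level calls used in these small scripts (finishTask above, killTask here,
# retryTask in the lost-file recovery tool) all return a (status, output) pair. A minimal
# sketch with a placeholder jediTaskID, purely for illustration:
import pandaserver.userinterface.Client as Client

jediTaskID = 123456  # placeholder task ID, not a real task
s, o = Client.killTask(jediTaskID)
if s != 0:
    # a non-zero status indicates a communication problem with the panda server
    print('ERROR: communication failure to the panda server: %s' % s)
else:
    print(o)
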
status, res = proxyS.querySQLS(sql, varMap)
if res is not None:
    for (id, lockedby) in res:
        if lockedby == 'jedi':
            jediJobs.append(id)
        else:
            jobs.append(id)

# reassign
jobs.sort()
if len(jobs):
    nJob = 100
    iJob = 0
    while iJob < len(jobs):
        print('reassign %s' % str(jobs[iJob:iJob + nJob]))
        Client.reassignJobs(jobs[iJob:iJob + nJob])
        iJob += nJob
        time.sleep(10)

if len(jediJobs) != 0:
    nJob = 100
    iJob = 0
    while iJob < len(jediJobs):
        print('kill JEDI jobs %s' % str(jediJobs[iJob:iJob + nJob]))
        Client.killJobs(jediJobs[iJob:iJob + nJob], codeV, keepUnmerged=options.keepUnmerged)
        iJob += nJob

print('\nreassigned {0} jobs'.format(len(jobs + jediJobs)))

srcSQL += ')'

jobs = []
tables = ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsWaiting4', 'ATLAS_PANDA.jobsDefined4']
for table in tables:
    sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (table, srcSQL)
    if options.jobID is not None:
        sql += "AND jobDefinitionID=:jobDefinitionID "
    if not options.jobsetID in (None, 'all'):
        sql += "AND jobsetID=:jobsetID "
    sql += "ORDER BY PandaID "
    status, res = proxyS.querySQLS(sql, varMap)
    if res is not None:
        for id, in res:
            if not id in jobs:
                jobs.append(id)

if len(jobs):
    iJob = 0
    nJob = 1000
    while iJob < len(jobs):
        subJobs = jobs[iJob:iJob + nJob]
        print("kill %s %s/%s" % (str(subJobs), iJob, len(jobs)))
        Client.killJobs(subJobs, code=9)
        iJob += nJob
else:
    print("no job was killed")

job.cloud = 'US'
job.cmtConfig = 'i686-slc4-gcc34-opt'

file = FileSpec()
file.lfn = "%s.evgen.pool.root" % job.jobName
file.destinationDBlock = job.destinationDBlock
file.destinationSE = job.destinationSE
file.dataset = job.destinationDBlock
file.destinationDBlockToken = 'ATLASDATADISK'
file.type = 'output'
job.addFile(file)

fileOL = FileSpec()
fileOL.lfn = "%s.job.log.tgz" % job.jobName
fileOL.destinationDBlock = job.destinationDBlock
fileOL.destinationSE = job.destinationSE
fileOL.dataset = job.destinationDBlock
fileOL.destinationDBlockToken = 'ATLASDATADISK'
fileOL.type = 'log'
job.addFile(fileOL)

job.jobParameters = "5144 1 5000 1 CSC.005144.PythiaZee.py %s NONE NONE NONE" % file.lfn
jobList.append(job)

for i in range(1):
    s, o = Client.submitJobs(jobList)
    print("---------------------")
    print(s)
    for x in o:
        print("PandaID=%s" % x[0])

def doBrokerage(self, inputList, vo, prodSourceLabel, workQueue, resource_name):
    # list with a lock
    inputListWorld = ListWithLock([])
    # variables for submission
    maxBunchTask = 100
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return for failure
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    tmpLog.debug('vo={0} label={1} queue={2} resource_name={3} nTasks={4}'.format(
        vo, prodSourceLabel, workQueue.queue_name, resource_name, len(inputList)))
    # loop over all tasks
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for tmpJediTaskID, tmpInputList in inputList:
        for taskSpec, cloudName, inputChunk in tmpInputList:
            # collect tasks for WORLD
            if taskSpec.useWorldCloud():
                inputListWorld.append((taskSpec, inputChunk))
                continue
            # make JobSpec to be submitted for TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) // 1024 // 1024
            if taskSpec.useWorldCloud():
                # use destinationSE to trigger task brokerage in WORLD cloud
                jobSpec.destinationSE = taskSpec.cloud
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                    jobSpec.addFile(tmpInFileSpec)
            # use secondary dataset name as prodDBlock
            if setProdDBlock is False and prodDBlock is not None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority
            if jobSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, jobSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW is None:
                tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # for old clouds
    if jobSpecList != []:
        # get fullRWs
        fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full RW')
            return retTmpError
        # set metadata
        for jobSpec in jobSpecList:
            rwValues = allRwMap[jobSpec.currentPriority]
            jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata,
                                                      str(rwValues), str(expRWs),
                                                      str(prioMap), str(fullRWs),
                                                      str(tt2Map))
        tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
        nBunchTask = 0
        while nBunchTask < len(jobSpecList):
            # get a bunch
            jobsBunch = jobSpecList[nBunchTask:nBunchTask + maxBunchTask]
            strIDs = 'jediTaskID='
            for tmpJobSpec in jobsBunch:
                strIDs += '{0},'.format(tmpJobSpec.taskID)
            strIDs = strIDs[:-1]
            tmpLog.debug(strIDs)
            # increment index
            nBunchTask += maxBunchTask
            # run task brokerage
            stS, outSs = PandaClient.runTaskAssignment(jobsBunch)
            tmpLog.debug('{0}:{1}'.format(stS, str(outSs)))
    # for WORLD
    if len(inputListWorld) > 0:
        # thread pool
        threadPool = ThreadPool()
        # get full RW for WORLD
        fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo, prodSourceLabel, None, None)
        if fullRWs is None:
            tmpLog.error('failed to calculate full WORLD RW')
            return retTmpError
        # get RW per priority
        for taskSpec, inputChunk in inputListWorld:
            if taskSpec.currentPriority not in allRwMap:
                tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(
                    vo, prodSourceLabel, workQueue, taskSpec.currentPriority)
                if tmpRW is None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority))
                    return retTmpError
                allRwMap[taskSpec.currentPriority] = tmpRW
        # live counter for RWs
        liveCounter = MapWithLock(allRwMap)
        # make workers
        ddmIF = self.ddmIF.getInterface(vo)
        for iWorker in range(4):
            thr = AtlasProdTaskBrokerThread(inputListWorld, threadPool,
                                            self.taskBufferIF, ddmIF,
                                            fullRWs, liveCounter,
                                            workQueue)
            thr.start()
        threadPool.join(60 * 10)
    # return
    tmpLog.debug('doBrokerage done')
    return self.SC_SUCCEEDED

                const=True,
                dest='modeOn',
                default=False,
                help='turn the debug mode on')
optP.add_option('--off', action='store_const',
                const=True,
                dest='modeOff',
                default=False,
                help='turn the debug mode off')
options, args = optP.parse_args()

if (options.modeOn and options.modeOff) or (not options.modeOn and not options.modeOff):
    print("ERROR: please set --on or --off")
    sys.exit(1)

if options.modeOn:
    s, o = Client.setDebugMode(args[0], True)
else:
    s, o = Client.setDebugMode(args[0], False)

if o == 'Succeeded':
    print(o)
else:
    print("ERROR:", o)
if s != 0:
    print("ERROR: communication failure to the panda server")
    sys.exit(1)
sys.exit(0)

def putFile(req, file):
    tmpLog = LogWrapper(_logger, 'putFile-{}'.format(datetime.datetime.utcnow().isoformat('/')))
    if not Protocol.isSecure(req):
        tmpLog.error('No SSL_CLIENT_S_DN')
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    # user name
    username = CoreUtils.clean_user_id(req.subprocess_env['SSL_CLIENT_S_DN'])
    tmpLog.debug("start %s %s" % (username, file.filename))
    # size check
    fullSizeLimit = 768*1024*1024
    if not file.filename.startswith('sources.'):
        noBuild = True
        sizeLimit = 100*1024*1024
    else:
        noBuild = False
        sizeLimit = fullSizeLimit
    # get file size
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except Exception:
        if "content-length" in req.headers_in:
            tmpLog.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            tmpLog.error("no CL")
    tmpLog.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength, sizeLimit)
        if noBuild:
            errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit"
        else:
            errStr += " Please remove redundant files from your workarea"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    try:
        fileName = file.filename.split('/')[-1]
        fileFullPath = '%s/%s' % (panda_config.cache_dir, fileName)
        # avoid overwriting
        if os.path.exists(fileFullPath):
            # touch
            os.utime(fileFullPath, None)
            # send error message
            errStr = "ERROR : Cannot overwrite file"
            tmpLog.debug('cannot overwrite file %s' % fileName)
            tmpLog.debug("end")
            return errStr
        # write
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        if hasattr(panda_config, 'compress_file_names') and \
                [True for patt in panda_config.compress_file_names.split(',') if re.search(patt, fileName) is not None]:
            fileContent = gzip.compress(fileContent)
        fo.write(fileContent)
        fo.close()
    except Exception:
        errStr = "ERROR : Cannot write file"
        tmpLog.error(errStr)
        tmpLog.debug("end")
        return errStr
    # checksum
    try:
        # decode Footer
        footer = fileContent[-8:]
        checkSum, isize = struct.unpack("II", footer)
        tmpLog.debug("CRC from gzip Footer %s" % checkSum)
    except Exception:
        # calculate on the fly
        """
        import zlib
        checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF
        """
        # use None to avoid delay for now
        checkSum = None
        tmpLog.debug("CRC calculated %s" % checkSum)
    # file size
    fileSize = len(fileContent)
    tmpLog.debug("written dn=%s file=%s size=%s crc=%s" % \
                 (username, fileFullPath, fileSize, checkSum))
    # put file info to DB
    if panda_config.record_sandbox_info:
        to_insert = True
        for patt in IGNORED_SUFFIX:
            if file.filename.endswith(patt):
                to_insert = False
                break
        if not to_insert:
            tmpLog.debug("skipped to insert to DB")
        else:
            statClient, outClient = Client.insertSandboxFileInfo(username, file.filename,
                                                                 fileSize, checkSum)
            if statClient != 0 or outClient.startswith("ERROR"):
                tmpLog.error("failed to put sandbox to DB with %s %s" % (statClient, outClient))
                # _logger.debug("putFile : end")
                # return "ERROR : Cannot insert sandbox to DB"
            else:
                tmpLog.debug("inserted sandbox to DB with %s" % outClient)
    tmpLog.debug("end")
    return True

        if id not in jobsMap[prio]:
            jobsMap[prio].append(id)

# order by PandaID and currentPriority
jobs = []
prioList = list(jobsMap)
prioList.sort()
for prio in prioList:
    # reverse order by PandaID to kill newer jobs
    ids = jobsMap[prio]
    ids.sort()
    ids.reverse()
    jobs += ids

if options.maxJobs is not None:
    jobs = jobs[:int(options.maxJobs)]

print('The number of jobs with priorities below %s : %s' % (args[0], len(jobs)))

if len(jobs):
    nJob = 100
    iJob = 0
    while iJob < len(jobs):
        print('kill %s' % str(jobs[iJob:iJob + nJob]))
        if options.forceKill:
            Client.killJobs(jobs[iJob:iJob + nJob], 9)
        else:
            Client.killJobs(jobs[iJob:iJob + nJob])
        iJob += nJob
        time.sleep(1)

import argparse

from pandaserver.userinterface import Client

# parse option
parser = argparse.ArgumentParser()
parser.add_argument('--panda_id', action='store', dest='panda_id', required=True,
                    help='PandaID of the job')
parser.add_argument('--com_str', action='store', dest='com', required=True,
                    help='The command string passed to the pilot. max 250 chars')
options = parser.parse_args()

s, o = Client.send_command_to_job(options.panda_id, options.com)
if s != 0:
    print(o)
else:
    if not o[0]:
        print('ERROR: {}'.format(o[1]))
    else:
        print('INFO: {}'.format(o[1]))

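# A small sketch (not from the original sources) calling the same API directly instead of
# going through the command-line wrapper above; the PandaID and command string are
# placeholder values for illustration only.
from pandaserver.userinterface import Client

status, out = Client.send_command_to_job(4242299116, 'dummy_command')  # placeholder arguments
if status != 0:
    print(out)
elif not out[0]:
    print('ERROR: {}'.format(out[1]))
else:
    print('INFO: {}'.format(out[1]))
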
def doBrokerage(self,inputList,vo,prodSourceLabel,workQueue):
    # list with a lock
    inputListWorld = ListWithLock([])
    # variables for submission
    maxBunchTask = 100
    # make logger
    tmpLog = MsgWrapper(logger)
    tmpLog.debug('start doBrokerage')
    # return for failure
    retFatal = self.SC_FATAL
    retTmpError = self.SC_FAILED
    tmpLog.debug('vo={0} label={1} queue={2} nTasks={3}'.format(vo,prodSourceLabel,
                                                                workQueue.queue_name,
                                                                len(inputList)))
    # loop over all tasks
    allRwMap = {}
    prioMap = {}
    tt2Map = {}
    expRWs = {}
    jobSpecList = []
    for tmpJediTaskID,tmpInputList in inputList:
        for taskSpec,cloudName,inputChunk in tmpInputList:
            # collect tasks for WORLD
            if taskSpec.useWorldCloud():
                inputListWorld.append((taskSpec,inputChunk))
                continue
            # make JobSpec to be submitted for TaskAssigner
            jobSpec = JobSpec()
            jobSpec.taskID = taskSpec.jediTaskID
            jobSpec.jediTaskID = taskSpec.jediTaskID
            # set managed to trigger TA
            jobSpec.prodSourceLabel = 'managed'
            jobSpec.processingType = taskSpec.processingType
            jobSpec.workingGroup = taskSpec.workingGroup
            jobSpec.metadata = taskSpec.processingType
            jobSpec.assignedPriority = taskSpec.taskPriority
            jobSpec.currentPriority = taskSpec.currentPriority
            jobSpec.maxDiskCount = (taskSpec.getOutDiskSize() + taskSpec.getWorkDiskSize()) / 1024 / 1024
            if taskSpec.useWorldCloud():
                # use destinationSE to trigger task brokerage in WORLD cloud
                jobSpec.destinationSE = taskSpec.cloud
            prodDBlock = None
            setProdDBlock = False
            for datasetSpec in inputChunk.getDatasets():
                prodDBlock = datasetSpec.datasetName
                if datasetSpec.isMaster():
                    jobSpec.prodDBlock = datasetSpec.datasetName
                    setProdDBlock = True
                for fileSpec in datasetSpec.Files:
                    tmpInFileSpec = fileSpec.convertToJobFileSpec(datasetSpec)
                    jobSpec.addFile(tmpInFileSpec)
            # use secondary dataset name as prodDBlock
            if setProdDBlock == False and prodDBlock != None:
                jobSpec.prodDBlock = prodDBlock
            # append
            jobSpecList.append(jobSpec)
            prioMap[jobSpec.taskID] = jobSpec.currentPriority
            tt2Map[jobSpec.taskID] = jobSpec.processingType
            # get RW for a priority
            if not allRwMap.has_key(jobSpec.currentPriority):
                tmpRW = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                   jobSpec.currentPriority)
                if tmpRW == None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(jobSpec.currentPriority))
                    return retTmpError
                allRwMap[jobSpec.currentPriority] = tmpRW
            # get expected RW
            expRW = self.taskBufferIF.calculateTaskRW_JEDI(jobSpec.jediTaskID)
            if expRW == None:
                tmpLog.error('failed to calculate RW for jediTaskID={0}'.format(jobSpec.jediTaskID))
                return retTmpError
            expRWs[jobSpec.taskID] = expRW
    # for old clouds
    if jobSpecList != []:
        # get fullRWs
        fullRWs = self.taskBufferIF.calculateRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
        if fullRWs == None:
            tmpLog.error('failed to calculate full RW')
            return retTmpError
        # set metadata
        for jobSpec in jobSpecList:
            rwValues = allRwMap[jobSpec.currentPriority]
            jobSpec.metadata = "%s;%s;%s;%s;%s;%s" % (jobSpec.metadata,
                                                      str(rwValues),str(expRWs),
                                                      str(prioMap),str(fullRWs),
                                                      str(tt2Map))
        tmpLog.debug('run task assigner for {0} tasks'.format(len(jobSpecList)))
        nBunchTask = 0
        while nBunchTask < len(jobSpecList):
            # get a bunch
            jobsBunch = jobSpecList[nBunchTask:nBunchTask+maxBunchTask]
            strIDs = 'jediTaskID='
            for tmpJobSpec in jobsBunch:
                strIDs += '{0},'.format(tmpJobSpec.taskID)
            strIDs = strIDs[:-1]
            tmpLog.debug(strIDs)
            # increment index
            nBunchTask += maxBunchTask
            # run task brokerage
            stS,outSs = PandaClient.runTaskAssignment(jobsBunch)
            tmpLog.debug('{0}:{1}'.format(stS,str(outSs)))
    # for WORLD
    if len(inputListWorld) > 0:
        # thread pool
        threadPool = ThreadPool()
        # get full RW for WORLD
        fullRWs = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,None,None)
        if fullRWs == None:
            tmpLog.error('failed to calculate full WORLD RW')
            return retTmpError
        # get RW per priority
        for taskSpec,inputChunk in inputListWorld:
            if not taskSpec.currentPriority in allRwMap:
                tmpRW = self.taskBufferIF.calculateWorldRWwithPrio_JEDI(vo,prodSourceLabel,workQueue,
                                                                        taskSpec.currentPriority)
                if tmpRW == None:
                    tmpLog.error('failed to calculate RW with prio={0}'.format(taskSpec.currentPriority))
                    return retTmpError
                allRwMap[taskSpec.currentPriority] = tmpRW
        # live counter for RWs
        liveCounter = MapWithLock(allRwMap)
        # make workers
        ddmIF = self.ddmIF.getInterface(vo)
        for iWorker in range(4):
            thr = AtlasProdTaskBrokerThread(inputListWorld,threadPool,
                                            self.taskBufferIF,ddmIF,
                                            fullRWs,liveCounter)
            thr.start()
        threadPool.join(60*10)
    # return
    tmpLog.debug('doBrokerage done')
    return self.SC_SUCCEEDED

#!/usr/bin/python
from __future__ import print_function

from pprint import pprint
import json, sys
import requests
import cPickle as pickle
from datetime import datetime

from pandaserver.taskbuffer import JobSpec
from pandaserver.userinterface import Client

# this is an example
job = Client.getJobStatus([4242299116])
spec = job[1][0]
att = spec.valuesMap()
pprint(att)

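# A follow-up sketch in the same spirit, with placeholder PandaIDs; it reuses the
# (status, jobSpecList) return shape handled above and in the getStatus() helper
# earlier in this collection, and skips unknown IDs that come back as None.
panda_ids = [4242299116, 4242299117]  # placeholder PandaIDs for illustration
status, job_specs = Client.getJobStatus(panda_ids)
if status == 0:
    for spec in job_specs:
        if spec is None:
            continue
        print(spec.PandaID, spec.jobStatus)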