def getSiteMapper(self):
    timeNow = datetime.datetime.utcnow()
    if datetime.datetime.utcnow() - self.dateTimeForSM > datetime.timedelta(minutes=10):
        self.siteMapper = SiteMapper(self)
        self.dateTimeForSM = timeNow
    return self.siteMapper
def __init__(self, conn):
    CommandReceiveInterface.__init__(self, conn)
    TaskBuffer.TaskBuffer.__init__(self)
    TaskBuffer.TaskBuffer.init(self, jedi_config.db.dbhost,
                               jedi_config.db.dbpasswd,
                               nDBConnection=1)
    # site mapper
    self.siteMapper = SiteMapper(self)
    # update time for site mapper
    self.dateTimeForSM = datetime.datetime.utcnow()
    logger.debug('__init__')
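# Illustration only (not part of the source): the same time-based cache-refresh
# pattern used by getSiteMapper() above, shown in isolation. CachedMapper and
# build_mapper() are hypothetical names for this sketch; the cached object is
# rebuilt once its age exceeds the refresh interval (10 minutes above).
import datetime


class CachedMapper(object):
    REFRESH = datetime.timedelta(minutes=10)

    def __init__(self, build_mapper):
        self._build = build_mapper
        self._mapper = build_mapper()
        self._built_at = datetime.datetime.utcnow()

    def get(self):
        now = datetime.datetime.utcnow()
        if now - self._built_at > self.REFRESH:
            # refresh the cached mapper and remember when it was rebuilt
            self._mapper = self._build()
            self._built_at = now
        return self._mapper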
        startTime = datetime.datetime(
            *time.strptime(timeM.group(1), '%b %d %H:%M:%S %Y')[:6])
        # kill old process
        if startTime < timeLimit:
            tmpLog.debug("old process : %s %s" % (pid, startTime))
            tmpLog.debug(line)
            commands_get_status_output('kill -9 %s' % pid)
except Exception:
    type, value, traceBack = sys.exc_info()
    tmpLog.error("kill process : %s %s" % (type, value))

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
aSiteMapper = SiteMapper(taskBuffer)

# delete
tmpLog.debug("Del session")
status, retSel = taskBuffer.querySQLS(
    "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
if retSel is not None:
    try:
        maxID = retSel[0][0]
        tmpLog.debug("maxID : %s" % maxID)
        if maxID is not None:
            varMap = {}
            varMap[':maxID'] = maxID
            varMap[':jobStatus1'] = 'activated'
            varMap[':jobStatus2'] = 'waiting'
            varMap[':jobStatus3'] = 'failed'
def main(backGround=False):
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir
        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        if not certOK:
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from pandaserver.taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
        ssl_opts = {'use_ssl': True,
                    'ssl_version': ssl.PROTOCOL_TLSv1,
                    'ssl_cert_file': certName,
                    'ssl_key_file': keyName}
        # resolve multiple brokers
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # set listener
        connList = []
        for tmpBroker in brokerList:
            try:
                clientid = 'PANDA-' + socket.getfqdn() + '-' + tmpBroker
                subscription_id = 'panda-server-consumer-' + socket.getfqdn()
                _logger.debug('setting listener %s' % clientid)
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)], **ssl_opts)
                connList.append(conn)
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" % (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        while True:
            for conn in connList:
                try:
                    if not conn.is_connected():
                        conn.set_listener('FileCallbackListener',
                                          FileCallbackListener(conn, taskBuffer, siteMapper,
                                                               subscription_id))
                        conn.start()
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue, id=subscription_id,
                                       ack='client-individual')
                        _logger.debug('listener %s is up and running' % clientid)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    _logger.error("failed to set listener on %s : %s %s" % (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            time.sleep(5)
# password
from pandaserver.config import panda_config
passwd = panda_config.dbpasswd

# logger
_logger = PandaLogger().getLogger('prioryMassage')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# get usage breakdown
usageBreakDownPerUser = {}
usageBreakDownPerSite = {}
workingGroupList = []
for table in ['ATLAS_PANDA.jobsActive4', 'ATLAS_PANDA.jobsArchived4']:
    varMap = {}
    varMap[':prodSourceLabel'] = 'user'
    varMap[':pmerge'] = 'pmerge'
    if table == 'ATLAS_PANDA.jobsActive4':
        sql = "SELECT COUNT(*),prodUserName,jobStatus,workingGroup,computingSite FROM %s WHERE prodSourceLabel=:prodSourceLabel AND processingType<>:pmerge GROUP BY prodUserName,jobStatus,workingGroup,computingSite" % table
    else:
        # with time range for archived table
        varMap[':modificationTime'] = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)
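# Illustration only (not part of the source): a sketch of how rows from the
# GROUP BY query above could be folded into per-user / per-site breakdowns.
# accumulate_usage() is a hypothetical helper; the column order in the tuple
# unpacking follows the SELECT list above, the dict layout is an assumption.
def accumulate_usage(rows, per_user, per_site):
    for cnt, user, job_status, _working_group, site in rows:
        # count jobs per user and per job status
        per_user.setdefault(user, {}).setdefault(job_status, 0)
        per_user[user][job_status] += cnt
        # count jobs per site and per job status
        per_site.setdefault(site, {}).setdefault(job_status, 0)
        per_site[site][job_status] += cnt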
def main(argv=tuple(), tbuf=None, **kwargs):
    try:
        long
    except NameError:
        long = int

    prelock_pid = GenericThread().get_pid()
    tmpLog = LogWrapper(_logger, "<pid={}>".format(prelock_pid))
    tmpLog.debug("===================== start =====================")

    # return value, true to run main again in next daemon loop
    ret_val = True

    # grace period
    try:
        gracePeriod = int(argv[1])
    except Exception:
        gracePeriod = 1

    # lock interval in minutes
    lock_interval = 10
    # retry interval in minutes
    retry_interval = 3

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # thread for adder
    class AdderThread(GenericThread):

        def __init__(self, taskBuffer, aSiteMapper, job_output_reports):
            GenericThread.__init__(self)
            self.taskBuffer = taskBuffer
            self.aSiteMapper = aSiteMapper
            self.job_output_reports = job_output_reports

        # main loop
        def run(self):
            # initialize
            taskBuffer = self.taskBuffer
            aSiteMapper = self.aSiteMapper
            # get file list
            timeNow = datetime.datetime.utcnow()
            timeInt = datetime.datetime.utcnow()
            # unique pid
            GenericThread.__init__(self)
            uniq_pid = self.get_pid()
            # log pid
            tmpLog.debug("pid={0} : run".format(uniq_pid))
            # stats
            n_processed = 0
            # loop
            while True:
                # get report
                one_jor = self.job_output_reports.pop()
                if not one_jor:
                    break
                # lock
                panda_id, job_status, attempt_nr, time_stamp = one_jor
                got_lock = taskBuffer.lockJobOutputReport(
                    panda_id=panda_id, attempt_nr=attempt_nr,
                    pid=uniq_pid, time_limit=lock_interval)
                if not got_lock:
                    continue
                # add
                try:
                    modTime = time_stamp
                    if (timeNow - modTime) > datetime.timedelta(hours=24):
                        # last add
                        tmpLog.debug("pid={0} : last add job={1}.{2} st={3}".format(
                            uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = False
                    else:
                        # usual add
                        tmpLog.debug("pid={0} : add job={1}.{2} st={3}".format(
                            uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = True
                    # get adder
                    adder_gen = AdderGen(taskBuffer, panda_id, job_status, attempt_nr,
                                         ignoreTmpError=ignoreTmpError,
                                         siteMapper=aSiteMapper,
                                         pid=uniq_pid,
                                         prelock_pid=uniq_pid,
                                         lock_offset=lock_interval - retry_interval)
                    n_processed += 1
                    # execute
                    adder_gen.run()
                    del adder_gen
                except Exception as e:
                    tmpLog.error("pid={} : failed to run with {} {}".format(
                        uniq_pid, str(e), traceback.format_exc()))
            # stats
            tmpLog.debug("pid={} : processed {}".format(uniq_pid, n_processed))

        # launcher, run with multiprocessing
        def proc_launch(self):
            # run
            self.process = multiprocessing.Process(target=self.run)
            self.process.start()

        # join of multiprocessing
        def proc_join(self):
            self.process.join()

    # TaskBuffer with more connections behind TaskBufferInterface
    tmpLog.debug("setup taskBufferIF")
    n_connections = 4
    _tbuf = TaskBuffer()
    _tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=n_connections)
    taskBufferIF = TaskBufferInterface()
    taskBufferIF.launch(_tbuf)

    # add files
    tmpLog.debug("run Adder")

    interval = 10
    nLoop = 10
    for iLoop in range(10):
        tmpLog.debug('start iLoop={}/{}'.format(iLoop, nLoop))
        start_time = datetime.datetime.utcnow()
        adderThrList = []
        nThr = 10
        n_jors_per_batch = 1000

        jor_lists = WeightedLists(multiprocessing.Lock())

        # get some job output reports
        jor_list_others = taskBuffer.listJobOutputReport(
            only_unlocked=True, time_limit=lock_interval,
            limit=n_jors_per_batch * nThr, grace_period=gracePeriod,
            anti_labels=['user'])
        jor_lists.add(3, jor_list_others)
        jor_list_user = taskBuffer.listJobOutputReport(
            only_unlocked=True, time_limit=lock_interval,
            limit=n_jors_per_batch * nThr, grace_period=gracePeriod,
            labels=['user'])
        jor_lists.add(7, jor_list_user)

        # adder consumer processes
        _n_thr_with_tbuf = 0
        tbuf_list = []
        tmpLog.debug("got {} job reports".format(len(jor_lists)))
        for i in range(nThr):
            if i < _n_thr_with_tbuf:
                tbuf = TaskBuffer()
                tbuf_list.append(tbuf)
                tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
                thr = AdderThread(tbuf, aSiteMapper, jor_lists)
            else:
                thr = AdderThread(taskBufferIF.getInterface(), aSiteMapper, jor_lists)
            adderThrList.append(thr)

        # start all threads
        for thr in adderThrList:
            # thr.start()
            thr.proc_launch()
            time.sleep(0.25)

        # join all threads
        for thr in adderThrList:
            # thr.join()
            thr.proc_join()
        [tbuf.cleanup() for tbuf in tbuf_list]

        end_time = datetime.datetime.utcnow()
        sleep_time = interval - (end_time - start_time).seconds
        if sleep_time > 0 and iLoop + 1 < nLoop:
            sleep_time = random.randint(1, sleep_time)
            tmpLog.debug("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)

    # stop TaskBuffer IF
    taskBufferIF.stop()

    tmpLog.debug("===================== end =====================")

    # return
    return ret_val
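# Illustration only (not part of the source): one plausible reading of the
# WeightedLists container used above. Several lists are registered with integer
# weights (3 for non-user reports, 7 for user reports), and pop() draws from a
# non-empty list with probability proportional to its weight. The real
# implementation in the codebase may differ; only the add()/pop()/__len__
# interface and the shared lock are taken from the calling code.
import random
import threading


class WeightedListsSketch(object):
    def __init__(self, lock=None):
        self._lock = lock or threading.Lock()
        self._entries = []  # list of (weight, items) pairs

    def add(self, weight, items):
        with self._lock:
            self._entries.append((weight, list(items)))

    def pop(self):
        with self._lock:
            candidates = [(w, lst) for w, lst in self._entries if lst]
            if not candidates:
                return None
            # weighted random choice among the non-empty lists
            total = sum(w for w, _ in candidates)
            pick = random.uniform(0, total)
            for w, lst in candidates:
                pick -= w
                if pick <= 0:
                    return lst.pop(0)
            return candidates[-1][1].pop(0)

    def __len__(self):
        with self._lock:
            return sum(len(lst) for _, lst in self._entries)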
except ImportError:
    from http.client import HTTPSConnection

import pandaserver.userinterface.Client as Client
from pandaserver.userinterface.Client import baseURLSSL
from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandaserver.brokerage.SiteMapper import SiteMapper
from pandaserver.config import panda_config
from pandaserver.dataservice import DataServiceUtils
from pandaserver.dataservice.DataServiceUtils import select_scope

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

id = sys.argv[1]
s, o = Client.getJobStatus([id])
if s != 0:
    print("failed to get job with:%s" % s)
    sys.exit(0)

job = o[0]

if job is None:
    print("got None")
    sys.exit(0)

xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
def run(self):
    # start
    try:
        byCallback = False
        if self.job is None:
            byCallback = True
            _logger.debug("start: %s" % self.dataset.name)
            _logger.debug("callback from %s" % self.site)
            # FIXME when callback from BNLPANDA disappeared
            if self.site == 'BNLPANDA':
                self.site = 'BNL-OSG2_ATLASMCDISK'
            # instantiate site mapper
            siteMapper = SiteMapper(self.taskBuffer)
            # get computingSite/destinationSE
            computingSite, destinationSE = self.taskBuffer.getDestSE(self.dataset.name)
            if destinationSE is None:
                # try to get computingSite/destinationSE from ARCH to delete sub
                # even if no active jobs left
                computingSite, destinationSE = self.taskBuffer.getDestSE(self.dataset.name, True)
                if destinationSE is None:
                    _logger.error("cannot get source/destination for %s" % self.dataset.name)
                    _logger.debug("end: %s" % self.dataset.name)
                    return
            _logger.debug("src: %s" % computingSite)
            _logger.debug("dst: %s" % destinationSE)
            # get corresponding token
            tmpSrcSiteSpec = siteMapper.getSite(computingSite)
            tmpDstSiteSpec = siteMapper.getSite(destinationSE)
            _logger.debug(tmpDstSiteSpec.setokens_output)
            destToken = None
            for scope in tmpDstSiteSpec.setokens_output:
                for setoken in tmpDstSiteSpec.setokens_output[scope]:
                    for tmpDdmId in tmpDstSiteSpec.setokens_output[scope][setoken]:
                        if self.site == tmpDdmId:
                            destToken = setoken
                            break
            _logger.debug("use Token=%s" % destToken)
            # get required tokens
            reqTokens = self.taskBuffer.getDestTokens(self.dataset.name)
            if reqTokens is None:
                _logger.error("cannot get required token for %s" % self.dataset.name)
                _logger.debug("end: %s" % self.dataset.name)
                return
            _logger.debug("req Token=%s" % reqTokens)
            # make bitmap for the token
            bitMap = 1
            if len(reqTokens.split(',')) > 1:
                for tmpReqToken in reqTokens.split(','):
                    if tmpReqToken == destToken:
                        break
                    # shift one bit
                    bitMap <<= 1
            # completed bitmap
            compBitMap = (1 << len(reqTokens.split(','))) - 1
            # ignore the lowest bit for T1, file on DISK is already there
            # TODO: #prodanaly use the scope, but don't know job information
            if tmpSrcSiteSpec.ddm_output == tmpDstSiteSpec.ddm_output:
                compBitMap = compBitMap & 0xFFFE
            # update bitmap in DB
            updatedBitMap = self.taskBuffer.updateTransferStatus(self.dataset.name, bitMap)
            _logger.debug("transfer status:%s - comp:%s - bit:%s" %
                          (hex(updatedBitMap), hex(compBitMap), hex(bitMap)))
            # update output files
            if (updatedBitMap & compBitMap) == compBitMap:
                ids = self.taskBuffer.updateOutFilesReturnPandaIDs(self.dataset.name)
                # set flag for T2 cleanup
                self.dataset.status = 'cleanup'
                self.taskBuffer.updateDatasets([self.dataset])
            else:
                _logger.debug("end: %s" % self.dataset.name)
                return
        else:
            _logger.debug("start: %s" % self.job.PandaID)
            # update input files
            ids = [self.job.PandaID]
        _logger.debug("IDs: %s" % ids)
        if len(ids) != 0:
            # get job
            if self.job is None:
                jobs = self.taskBuffer.peekJobs(ids,
                                                fromDefined=False,
                                                fromArchived=False,
                                                fromWaiting=False)
            else:
                jobs = [self.job]
            # loop over all jobs
            for job in jobs:
                if job is None:
                    continue
                _logger.debug("Job: %s" % job.PandaID)
                if job.jobStatus == 'transferring':
                    jobReady = True
                    failedFiles = []
                    noOutFiles = []
                    # check file status
                    for file in job.Files:
                        if file.type == 'output' or file.type == 'log':
                            if file.status == 'failed':
                                failedFiles.append(file.lfn)
                            elif file.status == 'nooutput':
                                noOutFiles.append(file.lfn)
                            elif file.status != 'ready':
                                _logger.debug("Job: %s file:%s %s != ready" %
                                              (job.PandaID, file.lfn, file.status))
                                jobReady = False
                                break
                    # finish job
                    if jobReady:
                        if byCallback:
                            _logger.debug("Job: %s all files ready" % job.PandaID)
                        else:
                            _logger.debug("Job: %s all files checked with catalog" % job.PandaID)
                        # create XML
                        try:
                            import xml.dom.minidom
                            dom = xml.dom.minidom.getDOMImplementation()
                            doc = dom.createDocument(None, 'xml', None)
                            topNode = doc.createElement("POOLFILECATALOG")
                            for file in job.Files:
                                if file.type in ['output', 'log']:
                                    # skip failed or no-output files
                                    if file.lfn in failedFiles + noOutFiles:
                                        continue
                                    # File
                                    fileNode = doc.createElement("File")
                                    fileNode.setAttribute("ID", file.GUID)
                                    # LFN
                                    logNode = doc.createElement("logical")
                                    lfnNode = doc.createElement("lfn")
                                    lfnNode.setAttribute('name', file.lfn)
                                    # metadata
                                    fsizeNode = doc.createElement("metadata")
                                    fsizeNode.setAttribute("att_name", "fsize")
                                    fsizeNode.setAttribute("att_value", str(file.fsize))
                                    # checksum
                                    if file.checksum.startswith('ad:'):
                                        # adler32
                                        chksumNode = doc.createElement("metadata")
                                        chksumNode.setAttribute("att_name", "adler32")
                                        chksumNode.setAttribute("att_value",
                                                                re.sub('^ad:', '', file.checksum))
                                    else:
                                        # md5sum
                                        chksumNode = doc.createElement("metadata")
                                        chksumNode.setAttribute("att_name", "md5sum")
                                        chksumNode.setAttribute("att_value",
                                                                re.sub('^md5:', '', file.checksum))
                                    # append nodes
                                    logNode.appendChild(lfnNode)
                                    fileNode.appendChild(logNode)
                                    fileNode.appendChild(fsizeNode)
                                    fileNode.appendChild(chksumNode)
                                    topNode.appendChild(fileNode)
                            # status of the job record
                            if failedFiles == []:
                                record_status = 'finished'
                            else:
                                record_status = 'failed'
                            # write to file
                            # xmlFile = '%s/%s_%s_%s' % (panda_config.logdir, job.PandaID, record_status,
                            #                            str(uuid.uuid4()))
                            # oXML = open(xmlFile, "w")
                            # oXML.write(topNode.toxml())
                            # oXML.close()
                            # write to job output report table, try update first
                            tmp_ret = self.taskBuffer.updateJobOutputReport(
                                panda_id=job.PandaID,
                                attempt_nr=job.attemptNr,
                                data=topNode.toxml())
                            if not tmp_ret:
                                # then try insert
                                self.taskBuffer.insertJobOutputReport(
                                    panda_id=job.PandaID,
                                    prod_source_label=job.prodSourceLabel,
                                    job_status=record_status,
                                    attempt_nr=job.attemptNr,
                                    data=topNode.toxml())
                        except Exception:
                            type, value, traceBack = sys.exc_info()
                            _logger.error("Job: %s %s %s" % (job.PandaID, type, value))
                _logger.debug("Job: %s status: %s" % (job.PandaID, job.jobStatus))
        # end
        if self.job is None:
            _logger.debug("end: %s" % self.dataset.name)
        else:
            _logger.debug("end: %s" % self.job.PandaID)
    except Exception:
        type, value, traceBack = sys.exc_info()
        _logger.error("run() : %s %s" % (type, value))
from pandaserver.taskbuffer import ErrorCode

# password
from pandaserver.config import panda_config

# logger
_logger = PandaLogger().getLogger('esPreemption')
tmpLog = LogWrapper(_logger)

tmpLog.debug("================= start ==================")

# instantiate TB
taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

# instantiate sitemapper
siteMapper = SiteMapper(taskBuffer)

# time limit
timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=15)

# get low priority ES jobs per site
sqlEsJobs = "SELECT PandaID,computingSite,commandToPilot,startTime "
sqlEsJobs += "FROM {0}.jobsActive4 ".format(panda_config.schemaPANDA)
sqlEsJobs += "WHERE prodSourceLabel IN (:label1,:label2) AND eventService=:es "
sqlEsJobs += "AND currentPriority<:prio AND jobStatus=:jobStat "
sqlEsJobs += "ORDER BY currentPriority,PandaID "

varMap = {}
varMap[':label1'] = 'managed'
varMap[':label2'] = 'test'
varMap[':es'] = 1
def main(argv=tuple(), tbuf=None, **kwargs):
    try:
        long
    except NameError:
        long = int

    tmpLog = LogWrapper(_logger, None)

    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf

    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS(
        "SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3, minute=2, second=0, microsecond=0)
        if (timeNow > logRotateTime and (timeNow - logRotateTime) < datetime.timedelta(minutes=5)) or \
                (logRotateTime > timeNow and (logRotateTime - timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limit
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search('^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(
                        match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName,
                                        datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = '^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += 'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check timerange
                        timeStamp = datetime.datetime(*time.strptime(
                            match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)
                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            if tmpSite not in pilotCountsS:
                                pilotCountsS[tmpSite] = dict()
                            if tmpMethod not in pilotCountsS[tmpSite]:
                                pilotCountsS[tmpSite][tmpMethod] = dict()
                            if tmpNode not in pilotCountsS[tmpSite][tmpMethod]:
                                pilotCountsS[tmpSite][tmpMethod][tmpNode] = 0
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        if (currentMinute / panda_config.nrun_interval) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA,
                                               fromDefined=False,
                                               fromActive=True,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD,
                                               fromDefined=True,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW,
                                               fromDefined=False,
                                               fromActive=False,
                                               fromArchived=False,
                                               fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob], 51, keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):

        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # takes care of only recent files
            modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(tmpName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output('ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    # terminate TaskBuffer IF
    # taskBufferIF.terminate()

    tmpLog.debug("===================== end =====================")
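# Illustration only (not part of the source): the batching pattern used in the
# co-jumbo kill loop above, shown generically. chunks() is a hypothetical
# helper; job IDs are sent to Client.killJobs in slices of at most `size`
# entries so a single call never carries an oversized payload.
def chunks(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# e.g. for batch in chunks(jediJobs, 100): Client.killJobs(batch, 51, keepUnmerged=True)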