def __init__(self, ceflavour=None):
    """Set up the agent's loggers, configuration, databases and monitoring.

    Args:
        ceflavour: CE flavour list, stored unchanged as ``self.flavour``.
            When omitted (``None``) a new ``['ARC-CE']`` list is used.
    """
    # Build the default per call: a list literal in the signature is created
    # once and would be shared by every instance relying on the default.
    if ceflavour is None:
        ceflavour = ['ARC-CE']

    # Get agent name from /path/to/aCTAgent.py (the slice drops the last
    # three characters of the script name, i.e. the ".py" suffix)
    self.name = os.path.basename(sys.argv[0])[:-3]

    # logger
    self.logger = aCTLogger.aCTLogger(self.name)
    self.log = self.logger()
    self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallog = self.criticallogger()

    # config
    self.conf = aCTConfig.aCTConfigAPP()
    self.arcconf = aCTConfig.aCTConfigARC()
    self.tmpdir = str(self.arcconf.get(['tmp', 'dir']))

    # database
    self.dbarc = aCTDBArc.aCTDBArc(self.log)
    self.dbcondor = aCTDBCondor.aCTDBCondor(self.log)
    self.dbpanda = aCTDBPanda.aCTDBPanda(self.log)

    # APFMon
    self.apfmon = aCTAPFMon.aCTAPFMon(self.conf)

    # CRIC info
    self.flavour = ceflavour
    self.cricparser = aCTCRICParser.aCTCRICParser(self.log)
    self.sites = {}
    self.osmap = {}
    self.sitesselect = ''

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s", self.name)
def __init__(self):
    """Create the configuration, loggers and database handles."""
    self.conf = aCTConfig.aCTConfigARC()

    # Regular logger
    report_logger = aCTLogger.aCTLogger("aCTReport")
    self.logger = report_logger
    self.log = report_logger()

    # Separate critical logger, created with arclog=False
    critical_logger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallogger = critical_logger
    self.criticallog = critical_logger()

    # Both database wrappers receive the ["db", "file"] configuration value
    self.db = aCTDBArc.aCTDBArc(
        self.log,
        self.conf.get(["db", "file"]),
    )
    self.pandadb = aCTDBPanda.aCTDBPanda(
        self.log,
        self.conf.get(["db", "file"]),
    )
def bootstrap():
    """Create the Panda tables; print a message if creation fails."""
    act_logger = aCTLogger('aCTBootstrap')
    bootstrap_log = act_logger()
    panda_db = aCTDBPanda(bootstrap_log)

    tables_created = panda_db.createTables()
    if tables_created:
        return
    print('Failed to create Panda tables, see aCTBootstrap.log for details')
def __init__(self, args):
    """Prepare output state, loggers and the ARC database handle.

    Args:
        args: object providing ``web`` (stored as ``self.outfile``) and
            ``conffiles`` (stored as ``self.actconfs``).
    """
    self.output = ""
    self.outfile = args.web
    # empty string for default behaviour
    self.actconfs = args.conffiles or ['']

    # Regular logger, with its underlying logger set to INFO
    report_logger = aCTLogger.aCTLogger("aCTReport")
    self.logger = report_logger
    self.actlog = report_logger()
    self.actlog.logger.setLevel(logging.INFO)

    # Separate critical logger, created with arclog=False
    critical_logger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallogger = critical_logger
    self.criticallog = critical_logger()

    # NOTE(review): the original was collapsed onto one line; the timestamp
    # write is assumed to belong to the same branch as the HTML header.
    if self.outfile:
        self.log('<META HTTP-EQUIV="refresh" CONTENT="60"><pre>')
        self.log(time.asctime() + '\n')

    self.db = aCTDBArc.aCTDBArc(self.actlog)
def bootstrap():
    """Create the client tables; print a message if creation fails."""
    act_logger = aCTLogger('aCTBootstrap')
    bootstrap_log = act_logger()
    client_tables = ClientDB(bootstrap_log)

    tables_created = client_tables.createTables()
    if tables_created:
        return
    print('Error creating client tables, see aCTBootstrap.log for details')
def bootstrap():
    """Create the LDMX tables; print a message if creation fails."""
    act_logger = aCTLogger('aCTBootstrap')
    bootstrap_log = act_logger()
    ldmx_db = aCTDBLDMX(bootstrap_log)

    tables_created = ldmx_db.createTables()
    if tables_created:
        return
    print('Failed to create LDMX tables, see aCTBootstrap.log for details')
def __init__(self, conf, log=None):
    """Store the logger and read monitoring settings from *conf*.

    Args:
        conf: configuration object, queried through ``conf.get([...])``.
        log: logger to use. When falsy, an ``aCTLogger`` named after the
            class is created with ``arclog=False`` and kept in
            ``self.logger``.
    """
    if not log:
        # No logger supplied: create our own and keep a handle on it
        self.logger = aCTLogger(self.__class__.__name__, arclog=False)
        log = self.logger()
    self.log = log

    # Settings taken from the "monitor", "joblog" and "panda" sections
    self.apfmonurl = conf.get(["monitor", "apfmon"])
    self.sendupdates = conf.get(["monitor", "update"])
    self.acturl = conf.get(["joblog", "urlprefix"])
    self.factory = conf.get(["panda", "schedulerid"])
def bootstrap_db():
    '''Set up the ARC and Condor DB tables'''
    act_logger = aCTLogger('aCTBootstrap')
    bootstrap_log = act_logger()

    # Both database objects are created before any table is set up
    steps = (
        (
            aCTDBArc(bootstrap_log),
            'Setting up ARC tables...',
            'Error creating arc tables, see aCTBootstrap.log for details',
        ),
        (
            aCTDBCondor(bootstrap_log),
            'Setting up Condor tables...',
            'Error creating condor tables, see aCTBootstrap.log for details',
        ),
    )

    for database, banner, failure in steps:
        print(banner)
        if not database.createTables():
            print(failure)
def __init__(self):
    """Set up the agent's name, loggers, configuration and databases."""
    # Get agent name from /path/to/aCTAgent.py
    script_name = os.path.basename(sys.argv[0])
    self.name = script_name[:-3]

    # logger
    agent_logger = aCTLogger.aCTLogger(self.name)
    self.logger = agent_logger
    self.log = agent_logger()
    critical_logger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallogger = critical_logger
    self.criticallog = critical_logger()

    # config
    self.conf = aCTConfig.aCTConfigATLAS()
    self.arcconf = aCTConfig.aCTConfigARC()

    # database: both wrappers receive the ["db", "file"] configuration value
    self.dbarc = aCTDBArc.aCTDBArc(
        self.log,
        self.conf.get(["db", "file"]),
    )
    self.dbpanda = aCTDBPanda.aCTDBPanda(
        self.log,
        self.conf.get(["db", "file"]),
    )

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s", self.name)
def __init__(self):
    """Set up the name, ARC configuration, logger and database handles."""
    # Script base name with its last three characters (".py") removed
    script_name = os.path.basename(sys.argv[0])
    self.name = script_name[:-3]

    self.arcconf = aCTConfig.aCTConfigARC()

    # Logger named after the script
    agent_logger = aCTLogger.aCTLogger(self.name)
    self.logger = agent_logger
    self.log = agent_logger()

    # Client and ARC databases share the same log
    self.clidb = clientdb.ClientDB(self.log)
    self.arcdb = aCTDBArc.aCTDBArc(self.log)

    self.log.info('Started {}'.format(self.name))
def __init__(self):
    """Set up the agent's loggers, configuration, databases and Rucio client."""
    # Get agent name from /path/to/aCTAgent.py
    script_name = os.path.basename(sys.argv[0])
    self.name = script_name[:-3]

    # logger
    agent_logger = aCTLogger.aCTLogger(self.name)
    self.logger = agent_logger
    self.log = agent_logger()
    critical_logger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallogger = critical_logger
    self.criticallog = critical_logger()

    # config
    self.conf = aCTConfig.aCTConfigAPP()
    self.arcconf = aCTConfig.aCTConfigARC()
    tmp_dir = self.arcconf.get(['tmp', 'dir'])
    self.tmpdir = str(tmp_dir)

    # database
    self.dbarc = aCTDBArc.aCTDBArc(self.log)
    self.dbldmx = aCTDBLDMX.aCTDBLDMX(self.log)

    # Rucio client
    self.rucio = Client()

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s", self.name)
def main():
    """Report stale-heartbeat Panda jobs and refresh their heartbeat time.

    Expects exactly one command-line argument, ``timelimit`` (integer
    seconds). Selects jobs with ``sendhb=1``, an active pandastatus and a
    non-zero ``theartbeat`` matched by ``dbpanda.timeStampLessThan``. For
    each one it calls ``aCTPanda.updateStatus`` with the job's current
    status, prints the response, and rewrites ``theartbeat`` in the DB.

    Exits with status 1 after printing the usage line when the argument is
    missing or is not an integer.
    """
    usage = "Usage: python aCTHeartbeatWatchdog.py timelimit"
    if len(sys.argv) != 2:
        print(usage)
        sys.exit(1)
    try:
        timelimit = int(sys.argv[1])
    except ValueError:
        # A non-numeric time limit is a usage error, not a traceback
        print(usage)
        sys.exit(1)

    # logger
    logger = aCTLogger('aCTHeartbeatWatchdog')
    log = logger()

    # database
    dbarc = aCTDBArc(log)
    dbpanda = aCTDBPanda(log)

    # Query for running jobs with theartbeat longer than timelimit seconds ago
    select = "sendhb=1 and " \
        "pandastatus in ('sent', 'starting', 'running', 'transferring') and " \
        "theartbeat != 0 and " + dbpanda.timeStampLessThan("theartbeat", timelimit)
    columns = ['pandaid', 'pandastatus', 'proxyid', 'sitename', 'theartbeat']
    jobs = dbpanda.getJobs(select, columns)
    if not jobs:
        return

    print(
        'Found %d jobs with outdated heartbeat (older than %d seconds):\n'
        % (len(jobs), timelimit))
    print('\t'.join(
        ['pandaid', 'site', 'status', 'theartbeat', 'Panda response']))

    # Panda server for each proxy, created on first use and then reused
    pandas = {}
    for job in jobs:
        proxyid = job['proxyid']
        if proxyid not in pandas:
            pandas[proxyid] = aCTPanda(log, dbarc.getProxyPath(proxyid))
        response = pandas[proxyid].updateStatus(job['pandaid'],
                                                job['pandastatus'])
        # str() on every field so a non-string column value (e.g. an unset
        # sitename) cannot make the join raise
        print('\t'.join([
            str(job['pandaid']),
            str(job['sitename']),
            str(job['pandastatus']),
            str(job['theartbeat']),
            str(response),
        ]))
        # update heartbeat time in the DB
        dbpanda.updateJob(
            job['pandaid'],
            {'theartbeat': dbpanda.getTimeStamp(time.time() + 1)})
def bootstrap_db():
    '''Set up the DB tables'''
    # TODO: setup only what is needed based on config and app
    act_logger = aCTLogger('aCTBootstrap')
    bootstrap_log = act_logger()

    # All database objects are created first, then the tables in this order
    targets = (
        (aCTDBArc(bootstrap_log),
         'Error creating arc tables, see aCTBootstrap.log for details'),
        (ClientDB(bootstrap_log),
         'Error creating client tables, see aCTBootstrap.log for details'),
        (aCTDBCondor(bootstrap_log),
         'Error creating condor tables, see aCTBootstrap.log for details'),
        (aCTDBPanda(bootstrap_log),
         'Error creating panda tables, see aCTBootstrap.log for details'),
    )

    for database, failure in targets:
        if not database.createTables():
            print(failure)
def __init__(self, args):
    """Prepare directories, logging and the process manager.

    Args:
        args: argument list; when it has at least two entries, ``args[1]``
            is passed to ``self.daemon``.

    Raises:
        Exception: any error raised while creating the process manager is
            logged as critical and re-raised.
    """
    # Check we have the right ARC version
    self.checkARC()

    # xml config file
    self.conf = aCTConfig.aCTConfigARC()
    self.appconf = aCTConfig.aCTConfigAPP()

    # Create required directories
    tmpdir = self.conf.get(["tmp", "dir"])
    self.makeDirs(tmpdir)
    for subdir in ('inputfiles', 'eventranges', 'failedlogs'):
        self.makeDirs(os.path.join(tmpdir, subdir))
    self.makeDirs(self.conf.get(["voms", "proxystoredir"]), 0o700)
    self.makeDirs(self.conf.get(["logger", "logdir"]))

    # logger
    self.logger = aCTLogger.aCTLogger("aCTMain")
    self.log = self.logger()

    # Check if we should run: an act.stop file in the aCT location disables it
    stopfile = os.path.join(self.conf.get(["actlocation", "dir"]), "act.stop")
    self.shouldrun = not os.path.exists(stopfile)
    if not self.shouldrun:
        self.log.warning("Detected act.stop file, won't start child processes")

    # daemon operations
    if len(args) >= 2:
        self.daemon(args[1])

    # process manager (only created when we are allowed to run)
    if not self.shouldrun:
        return
    try:
        self.procmanager = aCTProcessManager.aCTProcessManager(
            self.log, self.conf, self.appconf)
    except Exception as e:
        self.log.critical("*** Unexpected exception! ***")
        self.log.critical(traceback.format_exc())
        self.log.critical("*** Process exiting ***")
        raise e
import subprocess
import sys
import time
from datetime import datetime
from act.arc.aCTDBArc import aCTDBArc
from act.atlas.aCTDBPanda import aCTDBPanda
from act.common.aCTLogger import aCTLogger
from act.common.aCTConfig import aCTConfigARC

# Two command-line arguments are required. Unpacking fewer than two values
# raises, which lands in the usage message and exits with status 1.
try:
    service_id, webpage_url = sys.argv[1:3]
except:
    print('Usage: kibana.py service_id webpage_url')
    sys.exit(1)

# Module-level logger, database handles and configuration used by the
# functions below
logger = aCTLogger('kibana probe')
log = logger()
arcdb = aCTDBArc(log)
pandadb = aCTDBPanda(log)
config = aCTConfigARC()


def getARCJobs():
    """Return the result of ``arcdb.getNArcJobs('TRUE')`` as a string."""
    return str(arcdb.getNArcJobs('TRUE'))


def getARCSlots():
    """Sum ``RequestedSlots`` over the ARC jobs selected by state='Running'."""
    # NOTE(review): no return statement is visible in this chunk; the
    # function presumably continues beyond it -- confirm against the file.
    jobs = arcdb.getArcJobsInfo("state='Running'", ['RequestedSlots'])
    slots = 0
    for j in jobs:
        slots += j['RequestedSlots']
def _count_job(counts, state, corecount):
    """Add one job in *state* to *counts*; running jobs also add to "slots"."""
    counts[state] = counts.get(state, 0) + 1
    if state == "running":
        counts["slots"] = counts.get("slots", 0) + corecount


def _format_counts(counts, states):
    """Render one right-aligned 10-wide cell per state, "-" where absent."""
    missing = f'{"-":>10}'
    return ''.join(
        f'{counts[s]:>10}' if s in counts else missing for s in states)


def report(actconfs):
    """Build a text table of Panda job counts per site and state.

    For every entry in *actconfs* (a truthy entry is first exported as the
    ``ACTCONFIGARC`` environment variable) the ``pandajobs`` table is read
    and jobs are counted per ``sitename`` and ``actpandastatus``. Running
    jobs additionally add their ``corecount`` (1 when NULL) to a "slots"
    column. Counts from all configurations are accumulated into one table.

    Args:
        actconfs: sequence of configuration values; an empty/falsy entry
            leaves the environment untouched.

    Returns:
        The formatted report string. When exactly one configuration was
        given, the output of ``HarvesterReport()`` is appended.
    """
    actlogger = aCTLogger('aCTReport')
    logger = actlogger()
    rep = {}   # site -> {state: count}
    rtot = {}  # state -> count over all sites
    states = [
        "sent", "starting", "running", "slots", "tovalidate", "toresubmit",
        "toclean", "finished", "done", "failed", "donefailed", "tobekilled",
        "cancelled", "donecancelled"
    ]

    for conf in actconfs:
        if conf:
            os.environ['ACTCONFIGARC'] = conf

        db = aCTDBPanda(logger)
        c = db.db.conn.cursor()
        c.execute("select sitename, actpandastatus, corecount from pandajobs")
        for r in c.fetchall():
            site, state = str(r[0]), str(r[1])
            corecount = 1 if r[2] is None else int(r[2])
            # Explicit lookups with defaults instead of nested bare excepts,
            # so unrelated errors are no longer silently swallowed
            _count_job(rep.setdefault(site, {}), state, corecount)
            _count_job(rtot, state, corecount)

    # "slots" holds cores rather than jobs, so it is left out of the job total
    total_jobs = sum(v for k, v in rtot.items() if k != 'slots')
    header = ' '.join(f'{s:>9}' for s in states)
    parts = [
        f"All Panda jobs: {total_jobs}\n",
        f"{'':29} {header}\n",
    ]
    for site in sorted(rep):
        parts.append(f"{site:>28.28}:")
        parts.append(_format_counts(rep[site], states))
        parts.append('\n')
    parts.append(f'{"Totals":>28}:')
    parts.append(_format_counts(rtot, states))
    parts.append('\n\n')

    log = ''.join(parts)
    if len(actconfs) == 1:
        log += HarvesterReport()
    return log
#!/usr/bin/python import classad import logging from act.arc.aCTDBArc import aCTDBArc from act.condor.aCTDBCondor import aCTDBCondor from act.common.aCTProxy import aCTProxy from act.common.aCTLogger import aCTLogger logger = aCTLogger('acttest', cluster='test') log = logger() db = aCTDBArc(log) dbcondor = aCTDBCondor(log) xrsl = '''&(executable=/bin/sleep) (arguments=100) (stdout=stdout) (rerun=2) (gmlog=gmlog) (inputfiles = ("runpilot2-wrapper.sh" "/cvmfs/atlas.cern.ch/repo/sw/PandaPilotWrapper/latest/runpilot2-wrapper.sh")) (runtimeenvironment = "ENV/PROXY") ''' cad = classad.ClassAd('''\n [\n UserLog = "/tmp/log/test.$(ClusterId).log"; \n Out = "/tmp/output/test.$(ClusterId).out"; \n Err = "/tmp/error/test.$(ClusterId).err"; \n JobUniverse = 9; \n X509UserProxy = "/tmp/x509up_u100780"; \n
self.setInputs() self.setLog() self.setGMLog() self.setOutputs() self.setPriority() self.setEnvironment() def getXrsl(self): return "&" + '\n'.join(self.xrsl.values()) if __name__ == '__main__': from act.common.aCTLogger import aCTLogger from act.common.aCTConfig import aCTConfigAPP from datetime import datetime logger = aCTLogger('test') log = logger() pandajob = "jobsetID=799&logGUID=5ba37307-e4d7-4224-82f9-ff0503622677&cmtConfig=x86_64-slc6-gcc48-opt&prodDBlocks=user.rwatari%3Auser.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_RDO2RDOFTK_v01_all1E5ev_EXT2.99328897%2Cpanda.1110091801.467362.lib._9845189&dispatchDBlockTokenForOut=NULL%2CNULL%2CNULL&destinationDBlockToken=NULL%2CNULL%2CNULL&destinationSE=NULL&realDatasets=user.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev_EXT0%2F%2Cuser.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev_EXT1%2F%2Cuser.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev.log%2F&prodUserID=%2FDC%3Dch%2FDC%3Dcern%2FOU%3DOrganic+Units%2FOU%3DUsers%2FCN%3Drwatari%2FCN%3D764796%2FCN%3DRyutaro+Watari%2FCN%3Dproxy&GUID=51997D0A-850A-9044-A264-83A8986FE1C6%2C1de48e07-f37c-43e6-a343-3947342858b1&realDatasetsIn=user.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_RDO2RDOFTK_v01_all1E5ev_EXT2%2Cpanda.1110091801.467362.lib._9845189&nSent=0&cloud=ND&StatusCode=0&homepackage=AnalysisTransforms-AtlasProduction_20.7.3.7&inFiles=user.rwatari.9557718.EXT2._000016.RDO_FTK.pool.root%2Cpanda.1110091801.467362.lib._9845189.7456421499.lib.tgz&processingType=panda-client-0.5.69-jedi-athena-trf¤tPriority=814&fsize=1140292964%2C727003478&fileDestinationSE=ANALY_SiGNET_DIRECT%2CANALY_SiGNET_DIRECT%2CANALY_SiGNET_DIRECT&scopeOut=user.rwatari%2Cuser.rwatari&minRamCount=4772&jobDefinitionID=836&scopeLog=user.rwatari&transformation=http%3A%2F%2Fpandaserver.cern.ch%3A25085%2Ftrf%2Fuser%2FrunAthena-00-00-12&maxDiskCount=3167&coreCount=1&prodDBlockToken=NULL%2CNULL&transferType=NULL&desti
nationDblock=user.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev_EXT0.104826316_sub0341667607%2Cuser.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev_EXT1.104826317_sub0341667608%2Cuser.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev.log.104826315_sub0341667610&dispatchDBlockToken=NULL%2CNULL&jobPars=-l+panda.1110091801.467362.lib._9845189.7456421499.lib.tgz+--sourceURL+https%3A%2F%2Faipanda078.cern.ch%3A25443+-r+WorkArea%2Frun%2Ffast%2F+--trf+--useLocalIO++-i+%22%5B%27user.rwatari.9557718.EXT2._000016.RDO_FTK.pool.root%27%5D%22+-o+%22%7B%27IROOT%27%3A+%5B%28%27InDetDxAOD.pool.root%27%2C+%27user.rwatari.9845189.EXT0._002324.InDetDxAOD.pool.root%27%29%2C+%28%27esd.pool.root%27%2C+%27user.rwatari.9845189.EXT1._002324.esd.pool.root%27%29%5D%7D%22++-j+%22Reco_tf.py%2520--inputRDOFile%253Duser.rwatari.9557718.EXT2._000016.RDO_FTK.pool.root%2520--outputESDFile%253Desd.pool.root%2520%2520--doAllNoise%2520False%2520--autoConfiguration%253Deverything%2520--numberOfCavernBkg%253D0%2520--postInclude%253DFTKFastSim%2FInDetDxAOD.py%2520--preExec%2520%2527rec.UserAlgs%253D%255B%2522FTKFastSim%2FFTKFastSimulation_jobOptions.py%2522%255D%253Brec.doCalo.set_Value_and_Lock%2528False%2529%253Brec.doMuon.set_Value_and_Lock%2528False%2529%253Brec.doJetMissingETTag.set_Value_and_Lock%2528False%2529%253Brec.doEgamma.set_Value_and_Lock%2528False%2529%253Brec.doMuonCombined.set_Value_and_Lock%2528False%2529%253Brec.doTau.set_Value_and_Lock%2528False%2529%253Brec.doTrigger.set_Value_and_Lock%2528False%2529%253Brec.doFTK.set_Value_and_Lock%2528True%2529%253Bfrom%2520AthenaCommon.DetFlags%2520import%2520DetFlags%253BDetFlags.all_setOn%2528%2529%253BDetFlags.FTK_setOn%2528%2529%2527%2520--maxEvents%253D-1%2520--postExec%2520r2e%253A%2520%2527ServiceMgr%252B%253DService%2528%2522BeamCondSvc%2522%2529%253BbeamCondSvc%253DServiceMgr.BeamCondSvc%253BbeamCondSvc.useDB%253DFalse%253BbeamCondSvc.posX%253D-0.0497705%253
BbeamCondSvc.posY%253D1.06299%253BbeamCondSvc.posZ%253D0.0%253BbeamCondSvc.sigmaX%253D0.0251281%253BbeamCondSvc.sigmaY%253D0.0231978%253BbeamCondSvc.sigmaZ%253D0.1%253BbeamCondSvc.sigmaXY%253D-2.7745e-06%253BbeamCondSvc.tiltX%253D-1.51489e-05%253BbeamCondSvc.tiltY%253D-4.83891e-05%253B%2527%22&attemptNr=2&swRelease=Atlas-20.7.3&nucleus=NULL&maxCpuCount=0&outFiles=user.rwatari.9845189.EXT0._002324.InDetDxAOD.pool.root%2Cuser.rwatari.9845189.EXT1._002324.esd.pool.root%2Cuser.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev.log.9845189.002324.log.tgz&ddmEndPointOut=NDGF-T1_SCRATCHDISK%2CNDGF-T1_SCRATCHDISK%2CNDGF-T1_SCRATCHDISK&scopeIn=user.rwatari%2Cpanda&PandaID=3072596651&sourceSite=NULL&dispatchDblock=NULL%2Cpanda.1110091801.467362.lib._9845189&prodSourceLabel=user&checksum=ad%3Afd1c3aac%2Cad%3A516b31b3&jobName=user.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev%2F.3071213044&ddmEndPointIn=NDGF-T1_SCRATCHDISK%2CNDGF-T1_SCRATCHDISK&taskID=9845189&logFile=user.rwatari.1k_10mu.xm005_yp106.RDO.20161003_2_EXT0_PseduoTracking_v14_all1E5ev.log.9845189.002324.log.tgz" siteinfo = { 'schedconfig': 'ANALY_SiGNET_DIRECT', 'corecount': 1, 'truepilot': False, 'maxwalltime': 10800, 'direct_access_lan': True, 'type': 'analysis' } conf = aCTConfigAPP() pandadbjob = { 'pandajob': pandajob, 'siteName': 'ANALY_SiGNET_DIRECT', 'eventranges': None,
from act.common.aCTConfig import aCTConfigATLAS from act.common.aCTLogger import aCTLogger from act.arc.aCTDBArc import aCTDBArc from aCTDBPanda import aCTDBPanda from aCTPanda import aCTPanda import sys import time if len(sys.argv) != 2: print "Usage: python aCTHeartbeatWatchdog.py timelimit" sys.exit(1) timelimit = int(sys.argv[1]) # logger logger = aCTLogger('aCTHeartbeatWatchdog') log = logger() # config conf = aCTConfigATLAS() # database dbarc = aCTDBArc(log, conf.get(["db", "file"])) dbpanda = aCTDBPanda(log, conf.get(["db", "file"])) # Query for running jobs with theartbeat longer than timelimit seconds ago select = "sendhb=1 and " \ "pandastatus in ('sent', 'starting', 'running', 'transferring') and " \ "theartbeat != 0 and " + dbpanda.timeStampLessThan("theartbeat", timelimit) columns = ['pandaid', 'pandastatus', 'proxyid', 'sitename', 'theartbeat'] jobs = dbpanda.getJobs(select, columns) if jobs: