def collect(self):
    """Yield per-CE gauges for queued, running and finishing ARC jobs.

    Rows come from ``getGroupedJobs('cluster, arcstate')``. Each row is
    routed to a gauge according to its ``arcstate`` ('submitted',
    'running' or 'finishing'; other states are ignored) and labelled with
    its cluster, or the string 'None' when the cluster is empty.
    Everything produced by ``self.app_collect()`` is yielded afterwards.
    """
    # arcstate value -> gauge that counts jobs in that state
    gauges = {
        'submitted': GaugeMetricFamily('arc_queued_jobs',
                                       'Queued jobs per ARC CE',
                                       labels=['ce_endpoint']),
        'running': GaugeMetricFamily('arc_running_jobs',
                                     'Running jobs per ARC CE',
                                     labels=['ce_endpoint']),
        'finishing': GaugeMetricFamily('arc_finishing_jobs',
                                       'Finishing jobs per ARC CE',
                                       labels=['ce_endpoint']),
    }

    arcdb = aCTDBArc(self.log)
    for row in arcdb.getGroupedJobs('cluster, arcstate'):
        njobs = row['count(*)']
        endpoint = row['cluster'] or 'None'
        gauge = gauges.get(row['arcstate'])
        if gauge is not None:
            gauge.add_metric([endpoint], njobs)

    # dict order is insertion order: queued, running, finishing
    for gauge in gauges.values():
        yield gauge
    yield from self.app_collect()
def CondorJobReport(self):
    """Log a table of Condor job counts per cluster and JobStatus.

    For every configuration in ``self.actconfs`` (a non-empty entry is
    exported as ``ACTCONFIGARC`` before connecting) the ``condorjobs``
    table is read and jobs are counted per (cluster, JobStatus) and per
    JobStatus overall. The result is written through ``self.log``: a
    grand total, a header row, one row per cluster sorted by the last
    dot-separated component of its name, and a totals row.
    """
    # Column i of the table shows jobs whose JobStatus equals i, under
    # the heading condorjobstatemap[i].
    condorjobstatemap = ['Undefined',  # used before real state is known
                         'Idle',
                         'Running',
                         'Removed',
                         'Completed',
                         'Held',
                         'Transferring',
                         'Suspended']
    status_codes = range(len(condorjobstatemap))

    rep = {}   # cluster -> {JobStatus: count}
    rtot = {}  # JobStatus -> count over all clusters

    for conf in self.actconfs:
        if conf:
            os.environ['ACTCONFIGARC'] = conf

        db = aCTDBArc.aCTDBArc(self.actlog)
        c = db.db.conn.cursor()
        c.execute("select cluster, JobStatus from condorjobs")
        for r in c.fetchall():
            # NOTE(review): only an empty-string cluster is relabelled; a
            # None cluster becomes the string 'None' - confirm intended.
            cl = str(r[0])
            if not cl:
                cl = 'WaitingSubmission'
            jid = r[1]

            # Explicit counting instead of nested bare excepts, so that
            # unrelated errors are no longer silently swallowed.
            per_cluster = rep.setdefault(cl, {})
            per_cluster[jid] = per_cluster.get(jid, 0) + 1
            rtot[jid] = rtot.get(jid, 0) + 1

    self.log(f"All Condor jobs: {sum(rtot.values())}")
    header = ' '.join([f'{s:>9}' for s in condorjobstatemap])
    self.log(f"{'':39} {header}")

    for k in sorted(rep, key=lambda x: x.split('.')[-1]):
        log = f"{k:>38.38}:"
        for s in status_codes:
            log += f'{rep[k].get(s, "-"):>10}'
        self.log(log)

    log = f"{'Totals':>38}:"
    for s in status_codes:
        log += f'{rtot.get(s, "-"):>10}'
    self.log(log + '\n\n')
def __init__(self, ceflavour=None):
    """Set up logging, configuration, DB handles and CRIC state.

    ceflavour: list of CE flavours stored as ``self.flavour``. When it is
    omitted a new ``['ARC-CE']`` list is created for this instance, so
    instances never share (and cannot accidentally mutate) one default.
    """
    # Get agent name from /path/to/aCTAgent.py
    self.name = os.path.basename(sys.argv[0])[:-3]

    # logger
    self.logger = aCTLogger.aCTLogger(self.name)
    self.log = self.logger()
    self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallog = self.criticallogger()

    # config
    self.conf = aCTConfig.aCTConfigAPP()
    self.arcconf = aCTConfig.aCTConfigARC()
    self.tmpdir = str(self.arcconf.get(['tmp', 'dir']))

    # database
    self.dbarc = aCTDBArc.aCTDBArc(self.log)
    self.dbcondor = aCTDBCondor.aCTDBCondor(self.log)
    self.dbpanda = aCTDBPanda.aCTDBPanda(self.log)

    # APFMon
    self.apfmon = aCTAPFMon.aCTAPFMon(self.conf)

    # CRIC info
    self.flavour = ['ARC-CE'] if ceflavour is None else ceflavour
    self.cricparser = aCTCRICParser.aCTCRICParser(self.log)
    self.sites = {}
    self.osmap = {}
    self.sitesselect = ''

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s", self.name)
def ArcJobReport(self):
    """Log a table of ARC job counts per cluster and job state.

    For every configuration in ``self.actconfs`` (a non-empty entry is
    exported as ``ACTCONFIGARC`` before connecting) the ``arcjobs`` table
    is read. The cluster is the part of the job id between ``//`` and the
    next ``:``; ids that do not match are counted under
    'WaitingSubmission'. A state of None is counted as 'Other'. Output
    goes through ``self.log``: a grand total, a header row, one row per
    cluster sorted by the last dot-separated component of its name, and a
    totals row. States not listed in ``states`` are included in the grand
    total but have no column.
    """
    states = ["Undefined", "Accepted", "Preparing", "Submitting",
              "Queuing", "Running", "Finishing", "Finished", "Hold",
              "Killed", "Failed", "Deleted", "Other"]
    # Compiled once; captures the host part of <scheme>//<host>:...
    cluster_re = re.compile('.+//([^:]+)')

    rep = {}   # cluster -> {state: count}
    rtot = {}  # state -> count over all clusters

    for conf in self.actconfs:
        if conf:
            os.environ['ACTCONFIGARC'] = conf

        db = aCTDBArc.aCTDBArc(self.actlog)
        c = db.db.conn.cursor()
        c.execute("select jobid,state from arcjobs")
        for r in c.fetchall():
            reg = cluster_re.search(str(r[0]))
            cl = reg.group(1) if reg else 'WaitingSubmission'

            jid = str(r[1])
            if jid == 'None':
                jid = "Other"

            # Explicit counting instead of nested bare excepts, so that
            # unrelated errors are no longer silently swallowed.
            per_cluster = rep.setdefault(cl, {})
            per_cluster[jid] = per_cluster.get(jid, 0) + 1
            rtot[jid] = rtot.get(jid, 0) + 1

    self.log(f"All ARC jobs: {sum(rtot.values())}")
    header = ' '.join([f'{s:>9}' for s in states])
    self.log(f"{'':39} {header}")

    for k in sorted(rep, key=lambda x: x.split('.')[-1]):
        log = f"{k:>38.38}:"
        for s in states:
            log += f'{rep[k].get(s, "-"):>10}'
        self.log(log)

    log = f"{'Totals':>38}:"
    for s in states:
        log += f'{rtot.get(s, "-"):>10}'
    self.log(log + '\n\n')
def reconnectDB(self):
    '''
    Drop the current ARC DB handle, if there is one, and open a new one
    on the db file named in the configuration
    '''
    try:
        del self.db
    except AttributeError:
        # There was no handle to drop
        pass

    dbfile = self.conf.get(["db", "file"])
    self.db = aCTDBArc.aCTDBArc(self.log, dbfile)
def __init__(self):
    """Create the logger, the ARC and client DB handles and read tmp dir."""
    log = logging.getLogger(__name__)
    self.logger = log
    self.arcdb = aCTDBArc.aCTDBArc(log)
    self.clidb = clientdb.ClientDB(log)

    # TODO: if and when sites from arc config are used, move everything
    # that uses arc config to this class
    self.tmpdir = aCTConfig.aCTConfigARC().get(['tmp', 'dir'])
def __init__(self, logger, Interval=3600):
    """Store logger and interval, open the ARC DB and build the ARC user config.

    logger: logger handed to the DB layer and kept as ``self.log``.
    Interval: stored unchanged as ``self.interval`` (default 3600).
    """
    self.log = logger
    self.interval = Interval

    self.conf = aCTConfig.aCTConfigARC()
    dbfile = self.conf.get(["db", "file"])
    self.db = aCTDBArc(logger, dbfile)

    # User config is created with the SkipCredentials credential type
    skip_credentials = arc.initializeCredentialsType.SkipCredentials
    self.uc = arc.UserConfig(arc.initializeCredentialsType(skip_credentials))
    cacertdir = str(self.conf.get(["voms", "cacertdir"]))
    self.uc.CACertificatesDirectory(cacertdir)

    self.voms_proxies = {}
def reconnectDB(self):
    '''
    Reconnect DB

    Drops the existing ARC and Condor DB handles and creates new ones.
    Each handle is removed on its own, so a missing ``dbarc`` no longer
    prevents ``dbcondor`` from being released before it is recreated.
    '''
    for handle in ('dbarc', 'dbcondor'):
        try:
            delattr(self, handle)
        except AttributeError:
            # Already deleted
            pass

    self.dbarc = aCTDBArc.aCTDBArc(self.log)
    self.dbcondor = aCTDBCondor.aCTDBCondor(self.log)
def __init__(self):
    """Read the ARC config, create the loggers and open the ARC and Panda DBs."""
    self.conf = aCTConfig.aCTConfigARC()

    # regular and critical loggers
    self.logger = aCTLogger.aCTLogger("aCTReport")
    self.log = self.logger()
    self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallog = self.criticallogger()

    # both DB handles use the db file named in the config
    dbfile = self.conf.get(["db", "file"])
    self.db = aCTDBArc.aCTDBArc(self.log, dbfile)
    self.pandadb = aCTDBPanda.aCTDBPanda(self.log, dbfile)
def bootstrap_db():
    '''Create the ARC and Condor DB tables, printing progress and failures'''
    logger = aCTLogger('aCTBootstrap')
    log = logger()

    # (progress message, DB handle, failure message); both handles are
    # created before any table is set up
    steps = (
        ('Setting up ARC tables...',
         aCTDBArc(log),
         'Error creating arc tables, see aCTBootstrap.log for details'),
        ('Setting up Condor tables...',
         aCTDBCondor(log),
         'Error creating condor tables, see aCTBootstrap.log for details'),
    )

    for banner, database, failure in steps:
        print(banner)
        if not database.createTables():
            print(failure)
def __init__(self):
    """Set up name, ARC config, logger and the client and ARC DB handles."""
    # script file name without its last three characters (the ".py")
    script = os.path.basename(sys.argv[0])
    self.name = script[:-3]

    self.arcconf = aCTConfig.aCTConfigARC()

    self.logger = aCTLogger.aCTLogger(self.name)
    self.log = self.logger()

    self.clidb = clientdb.ClientDB(self.log)
    self.arcdb = aCTDBArc.aCTDBArc(self.log)

    started = 'Started {}'.format(self.name)
    self.log.info(started)
def __init__(self, log, conf, appconf):
    """Open the DBs, register the process lists and start single-instance processes.

    log: logger used for the DB handles and for messages.
    conf: configuration providing the aCT location and the log directory.
    appconf: configuration whose ["modules", "app"] list names the app
        modules; each module's ``app_processes`` are added to the
        single-instance processes.
    Raises ModuleNotFoundError when an app module cannot be imported.
    """
    # logger
    self.log = log
    self.actlocation = conf.get(["actlocation", "dir"])
    self.logdir = conf.get(["logger", "logdir"])

    # DB connection
    self.dbarc = aCTDBArc.aCTDBArc(self.log)
    self.dbcondor = aCTDBCondor.aCTDBCondor(self.log)

    # list of processes to run per cluster
    self.arcprocesses = ['act/arc/aCTStatus',
                         'act/arc/aCTFetcher',
                         'act/arc/aCTCleaner']
    self.condorprocesses = ['act/condor/aCTStatus',
                            'act/condor/aCTFetcher',
                            'act/condor/aCTCleaner']

    # submitter process
    self.arcsubmitter = 'act/arc/aCTSubmitter'
    self.condorsubmitter = 'act/condor/aCTSubmitter'

    # dictionary of processes:aCTProcessHandler of which to run a single instance
    self.processes_single = {'act/common/aCTProxyHandler': None}

    for app in appconf.getList(["modules", "app"]):
        try:
            ap = importlib.import_module(app).app_processes
            # collect first, then register, so a failure adds nothing
            found = {}
            for p in ap:
                found[f'{app.replace(".", "/")}/{p}'] = None
            self.processes_single.update(found)
        except ModuleNotFoundError as e:
            self.log.critical(f'No such module {app}')
            raise e
        except AttributeError:
            self.log.info(f'No app-specific processes found in {app}')
        else:
            self.log.info(f'Loaded {", ".join(ap)} processes from {app}')

    # dictionary of cluster to list of aCTProcessHandlers
    self.running = {}
    # dictionary of cluster to Submitter processes handlers, there should
    # be one per unique cluster in clusterlist
    self.submitters = {}

    # Start single instance processes
    for name in self.processes_single:
        handler = self.aCTProcessHandler(name, self.logdir,
                                         actlocation=self.actlocation)
        handler.start()
        self.processes_single[name] = handler
def __init__(self, args):
    """Prepare report output, loggers and the ARC DB handle.

    args: parsed arguments; ``args.web`` is stored as the output file and
    ``args.conffiles`` as the list of configurations to report on.
    """
    self.output = ""
    self.outfile = args.web

    # empty string for default behaviour
    conffiles = args.conffiles
    self.actconfs = conffiles if conffiles else ['']

    self.logger = aCTLogger.aCTLogger("aCTReport")
    self.actlog = self.logger()
    self.actlog.logger.setLevel(logging.INFO)

    self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallog = self.criticallogger()

    # NOTE(review): both header lines are taken to belong to the
    # outfile branch - confirm against the original layout.
    if self.outfile:
        self.log('<META HTTP-EQUIV="refresh" CONTENT="60"><pre>')
        self.log(time.asctime() + '\n')

    self.db = aCTDBArc.aCTDBArc(self.actlog)
def main():
    """Report Panda jobs with an outdated heartbeat and refresh them.

    Takes one command-line argument, the time limit in seconds. Jobs with
    ``sendhb=1``, an active pandastatus and a non-zero ``theartbeat``
    older than the limit are listed; for each one the status is re-sent
    to Panda (one aCTPanda per proxy id) and ``theartbeat`` is updated in
    the DB. Exits with status 1 on wrong usage.
    """
    if len(sys.argv) != 2:
        print("Usage: python aCTHeartbeatWatchdog.py timelimit")
        sys.exit(1)
    timelimit = int(sys.argv[1])

    # logger
    logger = aCTLogger('aCTHeartbeatWatchdog')
    log = logger()

    # database
    dbarc = aCTDBArc(log)
    dbpanda = aCTDBPanda(log)

    # Query for running jobs with theartbeat longer than timelimit seconds ago
    select = " and ".join([
        "sendhb=1",
        "pandastatus in ('sent', 'starting', 'running', 'transferring')",
        "theartbeat != 0",
        dbpanda.timeStampLessThan("theartbeat", timelimit),
    ])
    columns = ['pandaid', 'pandastatus', 'proxyid', 'sitename', 'theartbeat']
    jobs = dbpanda.getJobs(select, columns)

    # NOTE(review): the per-job loop is taken to run only when jobs were
    # found - confirm against the original layout.
    if not jobs:
        return

    print('Found %d jobs with outdated heartbeat (older than %d seconds):\n'
          % (len(jobs), timelimit))
    print('\t'.join(
        ['pandaid', 'site', 'status', 'theartbeat', 'Panda response']))

    # Panda server for each proxy
    servers = {}
    for job in jobs:
        proxyid = job['proxyid']
        if proxyid not in servers:
            servers[proxyid] = aCTPanda(log, dbarc.getProxyPath(proxyid))

        response = servers[proxyid].updateStatus(job['pandaid'],
                                                 job['pandastatus'])
        fields = (str(job['pandaid']),
                  job['sitename'],
                  job['pandastatus'],
                  str(job['theartbeat']),
                  str(response))
        print('\t'.join(fields))

        # update heartbeat time in the DB
        new_heartbeat = dbpanda.getTimeStamp(time.time() + 1)
        dbpanda.updateJob(job['pandaid'], {'theartbeat': new_heartbeat})
def bootstrap_db():
    '''Create the ARC, client, Condor and Panda DB tables, printing failures'''
    # TODO: setup only what is needed based on config and app
    logger = aCTLogger('aCTBootstrap')
    log = logger()

    # (DB handle, failure message); all handles are created before any
    # table is set up
    targets = (
        (aCTDBArc(log),
         'Error creating arc tables, see aCTBootstrap.log for details'),
        (ClientDB(log),
         'Error creating client tables, see aCTBootstrap.log for details'),
        (aCTDBCondor(log),
         'Error creating condor tables, see aCTBootstrap.log for details'),
        (aCTDBPanda(log),
         'Error creating panda tables, see aCTBootstrap.log for details'),
    )

    for database, failure in targets:
        if not database.createTables():
            print(failure)
def __init__(self):
    """Set up name, cluster, loggers, config, DB handles and ARC user config.

    The cluster is taken from the single command-line argument when
    exactly one is given; its host part (or path, when the URL has no
    network location) is appended to the log name.
    """
    # Get agent name from /path/to/aCTAgent.py
    self.name = os.path.basename(sys.argv[0])[:-3]

    self.cluster = ''
    clusterhost = ''
    if len(sys.argv) == 2:
        self.cluster = sys.argv[1]
        parsed = urlparse(self.cluster)
        if parsed.netloc:
            # host without the port
            clusterhost = parsed.netloc.partition(':')[0]
        else:
            clusterhost = parsed.path

    # logger
    if clusterhost:
        logname = '%s-%s' % (self.name, clusterhost)
    else:
        logname = self.name
    self.logger = aCTLogger.aCTLogger(logname, cluster=self.cluster)
    self.log = self.logger()
    self.criticallogger = aCTLogger.aCTLogger('aCTCritical',
                                              cluster=self.cluster,
                                              arclog=False)
    self.criticallog = self.criticallogger()

    # config
    self.conf = aCTConfig.aCTConfigARC()
    self.tmpdir = str(self.conf.get(['tmp', 'dir']))

    # database
    # TODO: subclasses for arc and condor with respective DBs defined there
    self.db = aCTDBArc.aCTDBArc(self.log)
    self.dbcondor = aCTDBCondor(self.log)

    # ARC Configuration
    # Credentials will be set by ARC agents for each job or set of jobs
    # but for now set default credential in config to keep ARC happy
    skip_credentials = arc.initializeCredentialsType.SkipCredentials
    self.uc = arc.UserConfig(arc.initializeCredentialsType(skip_credentials))
    proxypath = str(self.conf.get(['voms', 'proxypath']))
    self.uc.ProxyPath(proxypath)
    cacertdir = str(self.conf.get(["voms", "cacertdir"]))
    self.uc.CACertificatesDirectory(cacertdir)
    self.uc.Timeout(int(self.conf.get(['atlasgiis', 'timeout'])))

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s for cluster %s", self.name, self.cluster)
def __init__(self):
    """Set up name, loggers, ATLAS and ARC config and the ARC and Panda DBs."""
    # Get agent name from /path/to/aCTAgent.py
    script = os.path.basename(sys.argv[0])
    self.name = script[:-3]

    # logger
    self.logger = aCTLogger.aCTLogger(self.name)
    self.log = self.logger()
    self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallog = self.criticallogger()

    # config
    self.conf = aCTConfig.aCTConfigATLAS()
    self.arcconf = aCTConfig.aCTConfigARC()

    # database: both handles use the db file from the ATLAS config
    dbfile = self.conf.get(["db", "file"])
    self.dbarc = aCTDBArc.aCTDBArc(self.log, dbfile)
    self.dbpanda = aCTDBPanda.aCTDBPanda(self.log, dbfile)

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s", self.name)
def __init__(self, log, conf):
    """Open the ARC DB, register the process lists and start single-instance processes.

    log: logger used for the DB handle.
    conf: configuration providing the aCT location, log directory and db file.
    """
    # logger
    self.log = log
    self.conf = conf
    self.actlocation = conf.get(["actlocation", "dir"])
    self.logdir = self.conf.get(["logger", "logdir"])

    # DB connection
    dbfile = self.conf.get(["db", "file"])
    self.db = aCTDBArc.aCTDBArc(self.log, dbfile)

    # list of processes to run per cluster
    self.processes = ['act/arc/aCTStatus',
                      'act/arc/aCTFetcher',
                      'act/arc/aCTCleaner']
    # submitter process
    self.submitter = 'act/arc/aCTSubmitter'

    # dictionary of processes:aCTProcessHandler of which to run a single instance
    # TODO: app-specific processes in conf file instead of hard-coded
    self.processes_single = dict.fromkeys([
        'act/atlas/aCTAutopilot',
        'act/atlas/aCTPandaGetJobs',
        'act/atlas/aCTPanda2Arc',
        'act/common/aCTProxyHandler',
        'act/atlas/aCTATLASStatus',
        'act/atlas/aCTValidator',
        'act/atlas/aCTAGISFetcher',
    ])

    # dictionary of cluster to list of aCTProcessHandlers
    self.running = {}
    # dictionary of cluster to Submitter processes handlers, there should
    # be one per unique cluster in clusterlist
    self.submitters = {}

    # Start single instance processes
    for name in self.processes_single:
        handler = self.aCTProcessHandler(name, self.logdir,
                                         actlocation=self.actlocation)
        handler.start()
        self.processes_single[name] = handler
def __init__(self):
    """Set up name, loggers, config, the ARC and LDMX DBs and the Rucio client."""
    # Get agent name from /path/to/aCTAgent.py
    script = os.path.basename(sys.argv[0])
    self.name = script[:-3]

    # logger
    self.logger = aCTLogger.aCTLogger(self.name)
    self.log = self.logger()
    self.criticallogger = aCTLogger.aCTLogger('aCTCritical', arclog=False)
    self.criticallog = self.criticallogger()

    # config
    self.conf = aCTConfig.aCTConfigAPP()
    self.arcconf = aCTConfig.aCTConfigARC()
    tmpdir = self.arcconf.get(['tmp', 'dir'])
    self.tmpdir = str(tmpdir)

    # database
    self.dbarc = aCTDBArc.aCTDBArc(self.log)
    self.dbldmx = aCTDBLDMX.aCTDBLDMX(self.log)

    # Rucio client
    self.rucio = Client()

    # start time for periodic restart
    self.starttime = time.time()
    self.log.info("Started %s", self.name)
def PandaReport(self):
    """Log a table of Panda job counts per site and actpandastatus.

    For every configuration in ``self.actconfs`` (a non-empty entry is
    exported as ``ACTCONFIGARC`` before connecting) the ``pandajobs``
    table is read and jobs are counted per (site, state) and per state
    overall. Running jobs additionally add their ``corecount`` (1 when it
    is NULL) to a synthetic 'slots' column. Output goes through
    ``self.log``: a grand total that excludes 'slots', a header row, one
    row per site in sorted order, and a totals row.
    """
    states = ["sent", "starting", "running", "slots", "tovalidate",
              "toresubmit", "toclean", "finished", "done", "failed",
              "donefailed", "tobekilled", "cancelled", "donecancelled"]

    rep = {}   # site -> {state or 'slots': count}
    rtot = {}  # state or 'slots' -> count over all sites

    for conf in self.actconfs:
        if conf:
            os.environ['ACTCONFIGARC'] = conf

        db = aCTDBArc.aCTDBArc(self.actlog)
        c = db.db.conn.cursor()
        c.execute("select sitename, actpandastatus, corecount from pandajobs")
        for r in c.fetchall():
            site, state = (str(r[0]), str(r[1]))
            corecount = 1 if r[2] is None else int(r[2])

            # Explicit counting instead of nested bare excepts, so that
            # unrelated errors are no longer silently swallowed.
            per_site = rep.setdefault(site, {})
            per_site[state] = per_site.get(state, 0) + 1
            rtot[state] = rtot.get(state, 0) + 1
            if state == "running":
                per_site["slots"] = per_site.get("slots", 0) + corecount
                rtot["slots"] = rtot.get("slots", 0) + corecount

    njobs = sum([v for k, v in rtot.items() if k != 'slots'])
    self.log(f"All Panda jobs: {njobs}")
    header = ' '.join([f'{s:>9}' for s in states])
    self.log(f"{'':29} {header}")

    for k in sorted(rep.keys()):
        log = f"{k:>28.28}:"
        for s in states:
            log += f'{rep[k].get(s, "-"):>10}'
        self.log(log)

    log = f'{"Totals":>28}:'
    for s in states:
        log += f'{rtot.get(s, "-"):>10}'
    self.log(log + '\n\n')
#!/usr/bin/python import logging from act.arc.aCTDBArc import aCTDBArc from act.common.aCTProxy import aCTProxy from act.common.aCTLogger import aCTLogger logger = aCTLogger('acttest', cluster='test') log = logger() db = aCTDBArc(log, "act") xrsl = '''&(executable=/bin/sleep) (arguments=1) (stdout=stdout) (rerun=2) (gmlog=gmlog) (inputfiles = (file1 "srm://srm.ndgf.org:8443;cache=no/atlas/disk/atlasdatadisk/rucio/mc15_13TeV/fe/a0/AOD.07849074._019904.pool.root.1")) ''' p = aCTProxy(logging.getLogger(), 1) voms = "atlas" attribute = "" # e.g. attribute="/atlas/Role=production" proxypath = p.conf.get(["voms", "proxypath"]) validHours = 5 proxyid = 1 # p.createVOMSAttribute(voms, attribute, proxypath, validHours) db.insertArcJobDescription(xrsl, clusterlist='gsiftp://pcoslo5.cern.ch/fork', proxyid=proxyid, maxattempts=5)
import sys
import time
from datetime import datetime
from act.arc.aCTDBArc import aCTDBArc
from act.atlas.aCTDBPanda import aCTDBPanda
from act.common.aCTLogger import aCTLogger
from act.common.aCTConfig import aCTConfigARC

# Both positional arguments are required. Unpacking a short argument
# list raises ValueError, which is the only error handled here.
try:
    service_id, webpage_url = sys.argv[1:3]
except ValueError:
    print('Usage: kibana.py service_id webpage_url')
    sys.exit(1)

logger = aCTLogger('kibana probe')
log = logger()
arcdb = aCTDBArc(log)
pandadb = aCTDBPanda(log)
config = aCTConfigARC()


def getARCJobs():
    """Return the number of ARC jobs matching the condition 'TRUE', as a string."""
    return str(arcdb.getNArcJobs('TRUE'))


def getARCSlots():
    """Return the sum of RequestedSlots over jobs in state 'Running', as a string."""
    jobs = arcdb.getArcJobsInfo("state='Running'", ['RequestedSlots'])
    return str(sum(j['RequestedSlots'] for j in jobs))
def __init__(self):
    """Create the logger, the proxy helper and the ARC DB handle."""
    log = logging.getLogger(__name__)
    self.logger = log
    self.actproxy = aCTProxy.aCTProxy(log)
    self.arcdb = aCTDBArc.aCTDBArc(log)
import sys
import time

# Lists Panda jobs whose heartbeat is older than the given time limit.
# print is called with a single parenthesised argument so the script
# parses on Python 3 and prints the same text on Python 2.
if len(sys.argv) != 2:
    print("Usage: python aCTHeartbeatWatchdog.py timelimit")
    sys.exit(1)
timelimit = int(sys.argv[1])

# logger
logger = aCTLogger('aCTHeartbeatWatchdog')
log = logger()

# config
conf = aCTConfigATLAS()

# database
dbarc = aCTDBArc(log, conf.get(["db", "file"]))
dbpanda = aCTDBPanda(log, conf.get(["db", "file"]))

# Query for running jobs with theartbeat longer than timelimit seconds ago
select = "sendhb=1 and " \
         "pandastatus in ('sent', 'starting', 'running', 'transferring') and " \
         "theartbeat != 0 and " + dbpanda.timeStampLessThan("theartbeat", timelimit)
columns = ['pandaid', 'pandastatus', 'proxyid', 'sitename', 'theartbeat']
jobs = dbpanda.getJobs(select, columns)

if jobs:
    print('Found %d jobs with outdated heartbeat (older than %d seconds):\n' % (
        len(jobs), timelimit))
    print('\t'.join(
        ['pandaid', 'site', 'status', 'theartbeat', 'Panda response']))
# Bootstrap script: creates the configured MySQL database if needed and
# then the client and ARC tables.
import mysql.connector
import logging
import act.common.aCTConfig as aCTConfig
import act.client.clientdb as clientdb
import act.arc.aCTDBArc as aCTDBArc

# get db connection info
conf = aCTConfig.aCTConfigARC()
socket = conf.get(['db', 'socket'])
dbname = conf.get(['db', 'name'])

# connect to mysql
print('Connecting to mysql ...')
conn = mysql.connector.connect(unix_socket=socket)

# create database if it doesn't exist
# NOTE(review): dbname is formatted straight into the statement;
# presumably safe because it comes from local configuration - confirm.
print('Creating database {} ...'.format(dbname))
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS {}".format(dbname))
conn.commit()

# create tables in the database
print('Creating aCT tables ...')
clidb = clientdb.ClientDB()
arcdb = aCTDBArc.aCTDBArc(logging.getLogger(__name__))
clidb.createTables()
arcdb.createTables()
conn.commit()