es = Elasticsearch(hosts=[{ 'host': config['ES_HOST'], 'port': 9200, 'scheme': 'https' }], http_auth=(config['ES_USER'], config['ES_PASS']), request_timeout=60) if es.ping(): print('connected to ES.') else: print('no connection to ES.') sys.exit(1) ALARM = alarms('Virtual Placement', 'XCache', 'dead server') ct = datetime.now() currentTime = int(round(datetime.now().timestamp() * 1000)) lastHours = 0.5 startTime = currentTime - int(lastHours * 3600000) print('start time', startTime) print('current time', datetime.now()) # vp_liveness liveness_query = { "size": 100, "query": { "bool": { "must": [{
user = []
tkids = {}   # task id -> number of hits that reference it
users = {}   # user DN -> number of hits that reference it

# Collect the task id and user DN of every returned hit.  Slicing keeps the
# loop inside the list even if `results` is larger than the number of hits
# actually present in the response.
for hit in res['hits']['hits'][:results]:
    hit_source = hit['_source']
    tkid.append(hit_source['taskid'])
    user.append(hit_source['dn'])

# Tally occurrences in a single pass.  The earlier count-and-remove loops
# indexed the lists by position while shrinking them, which raised
# IndexError as soon as there were two or more distinct values.
for task in tkid:
    tkids[task] = tkids.get(task, 0) + 1
for dn in user:
    users[dn] = users.get(dn, 0) + 1

# Raise one alarm listing every distinct user and task involved.
if tkids:
    ALARM = alarms('Analytics', 'Frontier', 'Bad SQL queries')
    ALARM.addAlarm(
        body='Bad SQL queries',
        source={'users': list(users), 'tkids': list(tkids)},
    )
with open(config_path) as json_data: config = json.load(json_data, ) print('current time', datetime.now()) res = requests.get( 'http://graphite.mwt2.org/render?target=dcache.xrootd.*&format=json&from=now-2min' ) if (res.status_code == 200): data = res.json() print(data) print('recieved data on {} servers'.format(len(data))) else: print('problem in receiving connections!') ALARM = alarms('Virtual Placement', 'XCache', 'large number of connections') for server in data: serverIP = server['target'].replace('dcache.xrootd.', '').replace('_', '.') connections = server['datapoints'][-1][0] timestamp = server['datapoints'][-1][1] timestamp = datetime.fromtimestamp(timestamp) timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S") if not connections: print('n connections not retrieved... skipping.') continue if connections < 200: print('server {} has {} connections.'.format(serverIP, connections)) else: source = { "xcache": serverIP,
if dis > 0: mes += str(dis) + " disconnected " if pre > 0: mes += str(pre) + " unprocessed " frontiersrvr[r['key']] = mes + 'queries.' frsrvs.append(r['key']) print('problematic servers:', frontiersrvr) # ### Any non-zero value for any Frontier server triggers the alert # # The alert contains every Frontier server with failed queries and which kind of failures happened. if len(frontiersrvr) > 0 or len(taskid) > 0: ALARM = alarms('Analytics', 'Frontier', 'Failed queries') ALARM.addAlarm(body='Failed Frontier queries', tags=frsrvs, source={ 'servers': frontiersrvr, 'tasks': list(taskid) }) # body += '\tthis mail is to let you know that in the past ' + \ # str(nhours) + ' hours \n' # if len(frontiersrvr) > 0: # body += '\tthe following servers present failed queries: \n' # body += '\t(attached numbers correspond to rejected, disconnected and unprocessed queries) \n\n' # for fkey in frontiersrvr: # body += fkey # body += ' : '
# Check Cluster State
# ====
# This notebook checks the state of the ES cluster and, if needed, creates an alarm.
# It is run once per hour from a cron job.

import sys
from alerts import alarms
import requests
import json

with open('/config/config.json') as json_data:
    config = json.load(json_data, )

# Credentials go through `auth` instead of being concatenated into the URL:
# a user name or password containing '@', ':' or '/' would otherwise produce
# a malformed URL, and the secret would show up in any error message that
# prints the request URL.
ES_CONN = 'https://' + config['ES_HOST'] + ':9200/_cluster/health'

# The timeout stops the cron job from hanging indefinitely on a stuck cluster.
r = requests.get(
    ES_CONN,
    auth=(config['ES_USER'], config['ES_PASS']),
    timeout=60,
)
# Fail loudly on an HTTP error instead of tripping over a missing 'status'
# key in an error payload further down.
r.raise_for_status()
res = r.json()
print(res)

# A green cluster needs no alarm.
if res['status'] == 'green':
    sys.exit(0)

ALARM = alarms('Analytics', 'Elasticsearch', 'status')

if res['status'] == 'red':
    ALARM.addAlarm(body='Alert on Elastic cluster state [ES in red]', tags=['red'])

# Yellow is only reported once more than 10 shards are unassigned.
if res['status'] == 'yellow' and res['unassigned_shards'] > 10:
    ALARM.addAlarm(body='Alert on Elastic cluster state [ES in yellow]', tags=['yellow'])
es = Elasticsearch(hosts=[{ 'host': config['ES_HOST'], 'port': 9200, 'scheme': 'https' }], http_auth=(config['ES_USER'], config['ES_PASS']), request_timeout=60) if es.ping(): print('connected to ES.') else: print('no connection to ES.') sys.exit(1) ALARM = alarms('Analytics', 'WFMS', 'indexing') ct = datetime.now() currentTime = int(round(datetime.now().timestamp() * 1000)) lastHours = 7 startTime = currentTime - lastHours * 3600000 print('start time', startTime) print('current time', datetime.now()) # JOBS jobs_query = { "query": { "range": { "modificationtime": { "gte": startTime,
# ['Timestamp', 'Group', 'Sites', 'Host', 'Ip', 'Bandwidth', 'BandwidthRate', 'Hits', 'HitsRate'] for line in lines[1:]: vals = line.split('\t') if datetime.fromtimestamp(int(vals[0])).hour != current_hour: continue site = vals[2] if site in sites: failovers[site]['servers'] += 1 failovers[site]['requests'] += int(vals[7]) failovers[site]['data'] += int(vals[5]) print('failovers:', failovers) else: print('problem in receiving connections!') ALARM = alarms('SLATE', 'Squid', 'failovers') for site, details in failovers.items(): if details['servers'] == 0: continue source = { "site": site, "WNs": details['servers'], "requests": details['requests'], "data": details['data'] } print(source) ALARM.addAlarm(body='failover', tags=[site], source=source) print('Done.')
# Map each server key to its peak thread count, keeping only the servers
# that went above `threadlimit`.
frontiersrvr = {}
res = res['aggregations']['servers']['buckets']
for r in res:
    print(r)
    max_threads = r['maxthreads']['value']
    # Skip buckets without a value: comparing None against a number raises
    # TypeError.
    if max_threads is not None and max_threads > threadlimit:
        frontiersrvr[r['key']] = max_threads

print('problematic servers:', frontiersrvr)

# ### Submit alert if there are any servers showing a high number of simultaneous threads
#
# The number associated to each Frontier server is the highest number recorded during
# the given interval
if frontiersrvr:
    ALARM = alarms('Analytics', 'Frontier', 'Too many threads')
    # Tags are passed as a plain list of server names, as the other alarm
    # calls in this code base do; the per-server counts travel in `source`.
    # NOTE(review): the body text reads like a copy from the failed-queries
    # check -- confirm whether it should mention threads instead.
    ALARM.addAlarm(body='Failed Frontier queries',
                   tags=list(frontiersrvr),
                   source={'servers': frontiersrvr})

# Previous e-mail based notification (disabled):
# test_name = 'Too many concurrent threads'
# body += '\tthis mail is to let you know that the number of simultaneous threads went beyond '
# body += str(threadlimit) + ' on some servers \n\n'
# for fkey in frontiersrvr:
#     body += fkey
#     body += ' : '
#     body += str(frontiersrvr[fkey])
#     body += '\n'
# body += '\nBest regards,\nATLAS AAS'
# body += '\n\n To change your alerts preferences please use the following link:\n' + user.link
# A.sendGunMail(test_name, user.email, body)