Ejemplo n.º 1
0
# Connect to Elasticsearch over HTTPS using credentials from the loaded
# config.  (config, Elasticsearch, sys, datetime and alarms are expected
# to be imported/defined earlier in the original script.)
es = Elasticsearch(hosts=[{
    'host': config['ES_HOST'],
    'port': 9200,
    'scheme': 'https'
}],
                   http_auth=(config['ES_USER'], config['ES_PASS']),
                   request_timeout=60)

# Abort early if the cluster is unreachable; ping() returns a boolean.
if es.ping():
    print('connected to ES.')
else:
    print('no connection to ES.')
    sys.exit(1)

ALARM = alarms('Virtual Placement', 'XCache', 'dead server')

# Build the query window: the last 30 minutes, expressed in epoch milliseconds.
ct = datetime.now()
currentTime = int(round(datetime.now().timestamp() * 1000))
lastHours = 0.5
startTime = currentTime - int(lastHours * 3600000)
print('start time', startTime)
print('current time', datetime.now())

# vp_liveness

# NOTE(review): the query body is truncated in this excerpt; the remaining
# bool/must clauses continue beyond this fragment.
liveness_query = {
    "size": 100,
    "query": {
        "bool": {
            "must": [{
Ejemplo n.º 2
0
from collections import Counter

# Tally how many times each task id and each user DN occur in the first
# `results` hits of the Elasticsearch response `res` (both set earlier in
# the original script).
#
# The previous implementation counted by repeatedly calling
# list.count()/list.remove() while still indexing with range(results);
# once elements were removed, the fixed index could run past the shrunken
# list and raise IndexError.  It also appended to `tkid` without ever
# initialising it.  Counter builds the same value -> count mapping in a
# single O(n) pass.
tkid = []
user = []
for i in range(results):
    tkid.append(res['hits']['hits'][i]['_source']['taskid'])
    user.append(res['hits']['hits'][i]['_source']['dn'])

# Distinct value -> occurrence count, as the old remove-loops intended.
# NOTE(review): unlike the old code, the source lists are left intact
# (nothing later in this fragment reads them).
tkids = dict(Counter(tkid))
users = dict(Counter(user))

# Raise a Frontier alarm when any task ids with bad SQL queries were found;
# the source payload carries the affected user DNs and task ids.
if len(tkids) > 0:
    ALARM = alarms('Analytics', 'Frontier', 'Bad SQL queries')
    ALARM.addAlarm(
        body='Bad SQL queries',
        source={'users': list(users), 'tkids': list(tkids)},
    )
Ejemplo n.º 3
0
# Load the JSON configuration from config_path (defined earlier in the
# original script).
with open(config_path) as json_data:
    config = json.load(json_data, )

print('current time', datetime.now())

# Fetch per-server XRootD connection counts (last 2 minutes) from Graphite.
res = requests.get(
    'http://graphite.mwt2.org/render?target=dcache.xrootd.*&format=json&from=now-2min'
)
if (res.status_code == 200):
    data = res.json()
    print(data)
    print('recieved data on {} servers'.format(len(data)))
else:
    print('problem in receiving connections!')
    # NOTE(review): execution continues after a failed request, so `data`
    # in the loop below would be undefined (NameError) on any non-200
    # response — confirm whether the script should exit here instead.

ALARM = alarms('Virtual Placement', 'XCache', 'large number of connections')

# Walk every server series; the Graphite target name encodes the server IP
# with dots replaced by underscores.
for server in data:
    serverIP = server['target'].replace('dcache.xrootd.', '').replace('_', '.')
    # Use the most recent datapoint: [value, epoch-seconds].
    connections = server['datapoints'][-1][0]
    timestamp = server['datapoints'][-1][1]
    timestamp = datetime.fromtimestamp(timestamp)
    timestamp = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    if not connections:
        # Graphite can return None for the newest bucket; skip those.
        print('n connections not retrieved... skipping.')
        continue
    if connections < 200:
        print('server {} has {} connections.'.format(serverIP, connections))
    else:
        # Above the 200-connection threshold: start the alarm payload.
        # (The dict literal is truncated in this excerpt.)
        source = {
            "xcache": serverIP,
    # Fragment: tail of a loop over Frontier aggregation buckets `r`.  The
    # `dis` (disconnected) and `pre` (unprocessed) counts, the message
    # prefix `mes`, and the collections `frontiersrvr`/`frsrvs`/`taskid`
    # are all built earlier in the original script.
    if dis > 0:
        mes += str(dis) + " disconnected "
    if pre > 0:
        mes += str(pre) + " unprocessed "
    # Record the per-server failure summary and remember the server name.
    frontiersrvr[r['key']] = mes + 'queries.'
    frsrvs.append(r['key'])

print('problematic servers:', frontiersrvr)

# ### Any non-zero value for any Frontier server triggers the alert
#
# The alert contains every Frontier server with failed queries and which kind of failures happened.

# Raise one alarm listing every affected server (as tags) together with
# the per-server failure summaries and the task ids involved.
if len(frontiersrvr) > 0 or len(taskid) > 0:

    ALARM = alarms('Analytics', 'Frontier', 'Failed queries')
    ALARM.addAlarm(body='Failed Frontier queries',
                   tags=frsrvs,
                   source={
                       'servers': frontiersrvr,
                       'tasks': list(taskid)
                   })

    # Legacy email-notification body, kept commented out as found:
    #   body += '\tthis mail is to let you know that in the past ' + \
    #        str(nhours) + ' hours \n'
    #    if len(frontiersrvr) > 0:
    #         body += '\tthe following servers present failed queries: \n'
    #         body += '\t(attached numbers correspond to rejected, disconnected and unprocessed queries) \n\n'
    #         for fkey in frontiersrvr:
    #             body += fkey
    #             body += ' : '
# Check Cluster State
# ====
# This script checks the state of the ES cluster and creates an alarm if
# needed.  It is run once per hour from a cron job.

import json
import sys

import requests

from alerts import alarms

# Load ES host and credentials from the mounted configuration file.
with open('/config/config.json') as json_data:
    config = json.load(json_data)

# Query the cluster-health endpoint.  Credentials are passed via HTTP
# basic auth instead of being embedded in the URL (which breaks if the
# password contains reserved characters), and a timeout keeps the cron
# job from hanging forever on an unresponsive cluster.
ES_CONN = 'https://' + config['ES_HOST'] + ':9200/_cluster/health'
r = requests.get(ES_CONN,
                 auth=(config['ES_USER'], config['ES_PASS']),
                 timeout=60)
res = r.json()
print(res)

# A green cluster needs no alarm.
if res['status'] == 'green':
    sys.exit(0)

ALARM = alarms('Analytics', 'Elasticsearch', 'status')

# Red always alarms; yellow alarms only when many shards are unassigned.
if res['status'] == 'red':
    ALARM.addAlarm(body='Alert on Elastic cluster state [ES in red]',
                   tags=['red'])
if res['status'] == 'yellow' and res['unassigned_shards'] > 10:
    ALARM.addAlarm(body='Alert on Elastic cluster state [ES in yellow]',
                   tags=['yellow'])
Ejemplo n.º 6
0
# Connect to Elasticsearch over HTTPS with credentials from the loaded
# config.  (config, Elasticsearch, sys, datetime and alarms come from
# earlier in the original script.)
es = Elasticsearch(hosts=[{
    'host': config['ES_HOST'],
    'port': 9200,
    'scheme': 'https'
}],
                   http_auth=(config['ES_USER'], config['ES_PASS']),
                   request_timeout=60)

# Abort early if the cluster is unreachable.
if es.ping():
    print('connected to ES.')
else:
    print('no connection to ES.')
    sys.exit(1)

ALARM = alarms('Analytics', 'WFMS', 'indexing')

# Query window: the last 7 hours, expressed in epoch milliseconds.
ct = datetime.now()
currentTime = int(round(datetime.now().timestamp() * 1000))
lastHours = 7
startTime = currentTime - lastHours * 3600000
print('start time', startTime)
print('current time', datetime.now())

# JOBS

# NOTE(review): the range-query body is truncated in this excerpt.
jobs_query = {
    "query": {
        "range": {
            "modificationtime": {
                "gte": startTime,
Ejemplo n.º 7
0
    # Fragment: inside an `if` (header not shown) handling a successful
    # download of the squid failover report.  Each data line is
    # tab-separated with these columns:
    # ['Timestamp', 'Group', 'Sites', 'Host', 'Ip', 'Bandwidth', 'BandwidthRate', 'Hits', 'HitsRate']
    for line in lines[1:]:
        vals = line.split('\t')
        # Only count rows stamped within the current hour.
        if datetime.fromtimestamp(int(vals[0])).hour != current_hour:
            continue
        site = vals[2]
        # Accumulate per-site worker-node, request and data totals for
        # known sites (`sites` and `failovers` are built earlier).
        if site in sites:
            failovers[site]['servers'] += 1
            failovers[site]['requests'] += int(vals[7])
            failovers[site]['data'] += int(vals[5])

    print('failovers:', failovers)
else:
    print('problem in receiving connections!')

# Emit one SLATE/Squid failover alarm per site that had at least one
# affected worker node; sites with zero servers are skipped.
ALARM = alarms('SLATE', 'Squid', 'failovers')

for site_name, stats in failovers.items():
    n_servers = stats['servers']
    if not n_servers:
        continue
    payload = {
        "site": site_name,
        "WNs": n_servers,
        "requests": stats['requests'],
        "data": stats['data'],
    }
    print(payload)
    ALARM.addAlarm(body='failover', tags=[site_name], source=payload)

print('Done.')
Ejemplo n.º 8
0
# Collect every Frontier server whose peak number of simultaneous threads
# exceeded the limit during the queried interval.  `res` holds the raw
# aggregation response and `threadlimit` the threshold, both set earlier
# in the original script.
frontiersrvr = {}
res = res['aggregations']['servers']['buckets']
for r in res:
    print(r)
    if r['maxthreads']['value'] > threadlimit:
        # Record the highest thread count seen for this server.
        frontiersrvr[r['key']] = r['maxthreads']['value']

print('problematic servers:', frontiersrvr)

# ### Submit alert if there are any servers showing a high number of simultaneous threads
#
# The number associated to each Frontier server is the highest number recorded during
# the given interval

if len(frontiersrvr) > 0:
    ALARM = alarms('Analytics', 'Frontier', 'Too many threads')
    # Pass the server names as a list of tags, matching how the other
    # alarms in this codebase populate `tags` (previously the dict itself
    # was passed).  The body previously read 'Failed Frontier queries' —
    # a copy/paste from the failed-queries alert — and is corrected to
    # describe this alarm.
    ALARM.addAlarm(body='Too many concurrent threads',
                   tags=list(frontiersrvr),
                   source={'servers': frontiersrvr})