def __init__(self, **conf):
    """Build the alarm coordinator: redis handles, worker queues, XMPP bot.

    conf -- parsed yaml configuration; conf['database'] carries the
            connection keyword arguments for every redis client.
    """
    # keep the raw yaml configuration around for later use.
    self.conf = conf
    # redis handles: range resolver, alert db, hosts db.
    self.ranger = MagicShow(**conf['database'])
    self.alertr = RedisClient(db='alert', **conf['database'])
    self.hostr = RedisClient(db='hosts', **conf['database'])
    # blacklist manager.
    self.bm = BlackManager(**conf['database'])
    # per-policy alarm state, counters and cached configs/data.
    self.alarm_info = {}
    self.alarm_times = {}
    self.conf_info = {}
    self.data_info = {}
    self.time_stamp = {'check_time': {}, 'alarm_time': {}}
    # items currently being checked.
    self.alarm_check = {}
    self.alive = True
    # queues feeding the check / alarm worker sub-processes.
    self.check_enqueue = multiprocessing.Queue()
    self.check_dequeue = multiprocessing.Queue()
    self.alarm_enqueue = multiprocessing.Queue()
    self.alarm_dequeue = multiprocessing.Queue()
    # bookkeeping of spawned worker processes.
    self.procs = {}
    # bring up the XMPP bot and join its chat room.
    self.xmpp = XMPPBOT(address=("125.39.223.167",5222), **conf['database'])
    self.xmpp.start()
    self.xmpp.enter_room()
def __init__(self, add_queue, del_queue, **conf):
    """Initialise the agent-monitor worker process.

    add_queue / del_queue -- queues feeding newly added / removed servers
    to this process; conf['database'] holds redis connection kwargs.
    """
    multiprocessing.Process.__init__(self)
    self.conf = conf
    self.ranger = MagicShow(**self.conf['database'])
    self.alertr = RedisClient(db='alert', **self.conf['database'])
    # servers currently owned by this process.
    self.total_server = []
    self.add_queue = add_queue
    self.del_queue = del_queue
    # main-loop control flag.
    self.alive = True
    # collected client data.
    self.data_info = {}
    # host -> socket map, plus fd -> host reverse map.
    self.connections = {}
    self.filenos = {}
    # assorted timestamps.
    self.timestamps = {}
    # partially received payload per host.
    self.requests = {}
    # epoll object driving non-blocking reads.
    self.epoll = select.epoll()
    # per-host configuration cache.
    self.host_configs = {}
def __init__(self, **conf):
    """Initialise the TCP-monitor worker process from *conf*."""
    multiprocessing.Process.__init__(self)
    self.conf = conf
    self.ranger = MagicShow(**self.conf['database'])
    self.alertr = RedisClient(db='alert', **self.conf['database'])
    # tcp collector configuration (original comment said "http" — attribute is tcp).
    self.tcp_config = {}
    self.record = {}
    self.alive = True
    # timestamps for config reload and data save.
    self.time_stamp = {}
    # last check time per item.
    self.check_stamp = {}
    # worker queues.
    self.check_dequeue = multiprocessing.Queue()
    self.check_enqueue = multiprocessing.Queue()
class HTTPMonitor(multiprocessing.Process):
    """Worker process that checks HTTP collector targets.

    Pulls collector configs from the 'alert' redis db, expands each
    config's 'target' range expression into a host list (cached per
    expression), and stores the result in self.http_config.
    """

    def __init__(self, **conf):
        multiprocessing.Process.__init__(self)
        self.conf = conf
        self.ranger = MagicShow(**self.conf['database'])
        self.alertr = RedisClient(db='alert', **self.conf['database'])
        # item -> expanded collector config.
        self.http_config = dict()
        self.record = dict()
        self.alive = True
        # timestamps for config reload and data save.
        self.time_stamp = dict()
        # last check time per item.
        self.check_stamp = dict()
        # worker queues.
        self.check_dequeue = multiprocessing.Queue()
        self.check_enqueue = multiprocessing.Queue()

    def get_conf(self):
        """Reload all http collector configs into self.http_config.

        Each stored value is a JSON blob; entries that fail to parse are
        skipped. The 'target' field (a range expression) is replaced by
        the resolved host list.
        """
        configs = self.alertr.hgetall('total:http:collector:configs')
        # cache of range expression -> resolved host list, so each
        # expression is only resolved once per reload.
        find_host = dict()
        self.http_config = dict()
        for item, conf in configs.items():
            try:
                conf = json.loads(conf)
            # narrowed from a blanket `except Exception`: json.loads raises
            # ValueError on bad JSON, TypeError on non-string input.
            except (ValueError, TypeError):
                continue
            target = conf.pop('target')
            if target in find_host:
                hosts = find_host[target]
            else:
                hosts = find_host[target] = self.list_servers(target)
            conf['target'] = hosts
            self.http_config[item] = conf
# NOTE(review): this chunk begins mid-function -- the statements down to
# "print e" reference names (html_part, msg, email) defined by a send-mail
# helper whose `def` line is not visible here; confirm against the full file.
html_part.set_charset('utf-8')
msg.attach(html_part)
try:
    smtp = smtplib.SMTP()
    # internal mail relay; failures are only printed, not retried.
    smtp.connect("mx.hy01.wandoujia.com",25)
    smtp.sendmail('*****@*****.**', email, msg.as_string())
except Exception,e:
    print e

# Script entry: parse options, then pull the latest disk-usage alarm
# results from the 'alert' redis db.
options = OptConf()
opt = options.opt
args = options.args
conf = options.conf
alertr = RedisClient(db="alert", **conf["database"])
alarm = alertr.hgetall("result:system:disk_pct_use")
data = list()
# recipient addresses (redacted in this dump).
addrs = [
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**'
]
addr = ','.join(addrs)
def __init__(self,**conf):
    # Redis handle on the 'black' (blacklist) db; **conf carries connection kwargs.
    self.blackr = RedisClient(db='black', **conf)
    # Range resolver used to expand host range expressions.
    self.ranger = MagicShow(**conf)
class BlackManager(object): def __init__(self,**conf): self.blackr = RedisClient(db='black', **conf) self.ranger = MagicShow(**conf) def addbl(self,data): expire = data['expire'] hosts = self.ranger.load(data['range']).show() if hosts == 'all': key = 'black_%s_%s' % (data['policy'],host) self.blackr.set(key,'black',ex=expire) else: for host in hosts: key = 'black_%s_%s' % (data['policy'],host) self.blackr.set(key,'black',ex=expire) def delbl(self,data): print data policy = data['policy'] hosts = self.ranger.load(data['range']).show() for host in hosts: if policy != 'all': key = 'black_%s_%s' % (policy,host) self.blackr.delete(key) else: keys = self.blackr.keys('black_*_%s' % host) for key in keys: self.blackr.delete(key) def check(self,host,policy=None): keys = list() key = 'black_all_%s' % (host) keys.append(key) if policy: keys.append('black_%s_%s' % (policy,host)) keys.append('black_%s_all' % (policy)) for key in keys: if self.blackr.exists(key): return True return False def show(self): keys = self.blackr.keys('black*') for k in keys: try: typer,policy,host = k.split('_') ttl = self.blackr.pttl(k) except: print "=================", k else: print "策略:%s, 主机:%s, 剩余时间:%s 秒" % (policy,host,int(ttl/1000))
# NOTE(review): this chunk begins mid-function -- the statements down to
# "print e" reference names (html_part, msg, email) defined by a send-mail
# helper whose `def` line is not visible here; confirm against the full file.
html_part.set_charset('utf-8')
msg.attach(html_part)
try:
    smtp = smtplib.SMTP()
    # internal mail relay; failures are only printed, not retried.
    smtp.connect("mx.hy01.wandoujia.com",25)
    smtp.sendmail('*****@*****.**', email, msg.as_string())
except Exception,e:
    print e

# Script entry: parse options, then pull the accumulated alarm errors
# from the 'alert' redis db.
options = OptConf()
opt = options.opt
args = options.args
conf = options.conf
alertr = RedisClient(db='alert', **conf['database'])
print "redis connect stat: %s" % alertr.ping()
alarm = alertr.hgetall('total:alarm:errors')
data = list()
# recipient addresses (redacted in this dump).
addrs = [
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**',
    '*****@*****.**'
]
addr = ','.join(addrs)
from time import sleep, time

# Make the project's lib/ directory importable relative to this script.
LIBPATH = os.path.dirname( os.path.abspath(__file__) )+ '/../' + 'lib'
sys.path.append( LIBPATH )
from Utils import OptConf, Dumper, RedisClient
from Utils import BreakHere
from Radix import MagicShow
from Radix import CharSet

# Command-line spec: --fname pointing at a yaml file of group definitions.
parm = { 'fname=s' : 'a conf info data file, yaml format' }
options = OptConf( **parm )
opt = options.opt
args = options.args
conf = options.conf
if 'fname' not in opt:
    options.help()
alertr = RedisClient(db='alert', **conf['database'])
print "redis connect stat: %s" % alertr.ping()
pfile = opt['fname']
# NOTE(review): yaml.load without SafeLoader executes arbitrary yaml tags;
# fine only if the file is trusted.
data = yaml.load(open(pfile))
# Snapshot the currently stored group:* values.
groups = alertr.keys('group:*')
old_data = dict()
for group in groups:
    try:
        v = alertr.get(group)
    except:
        pass
    # NOTE(review): if get() raised, `v` is stale (or undefined on the first
    # iteration) yet is still stored here -- looks like a bug; confirm intent.
    old_data[group] = v
# NOTE(review): chunk ends mid-loop -- the body of this `for` continues
# beyond what is visible here.
for p in data:
    name = 'group:'+p.pop('name')
# NOTE(review): this chunk starts mid `try:` block -- the orphan
# `except ImportError` below implies a `try:` line above the visible text.
from Utils import OptConf, Dumper
from Utils import BreakHere, RedisClient
from Radix import MagicShow
from Radix import CharSet
except ImportError, e:
    print os.path.abspath(__file__), str(e)
    exit(1)

# Command-line spec: --fname pointing at a yaml file of policy definitions.
parm = { 'fname=s' : 'a conf info data file, yaml format' }
options = OptConf( **parm )
opt = options.opt
args = options.args
conf = options.conf
if 'fname' not in opt:
    options.help()
alertr = RedisClient(db='alert', **conf['database'])
print "redis connect stat: %s" % alertr.ping()
pfile = opt['fname']
# NOTE(review): yaml.load without SafeLoader executes arbitrary yaml tags.
data = yaml.load(open(pfile))
dict_obj = dict()
# recognised collector categories.
conf_type = ['client', 'http', 'tcp']
# NOTE(review): chunk ends mid-loop -- the body of this `for` continues
# beyond what is visible here.
for p in data:
    policy_key = ''
    name = p['name']
    collector = p['collector']
    print "collector", collector
from time import sleep, time
# Make the project's lib/ directory importable relative to this script.
LIBPATH = os.path.dirname( os.path.abspath(__file__) )+ '/../' + 'lib'
sys.path.append( LIBPATH )
# NOTE(review): the orphan `except ImportError` below implies a `try:`
# somewhere above the visible text of this chunk.
from Utils import OptConf, RedisClient
from Radix import MagicShow
from Radix import CharSet
except ImportError, e:
    print os.path.abspath(__file__), str(e)
    exit(1)

# Command-line spec: --fname pointing at the input data file.
parm = { 'fname=s' : 'a conf info data file, one json per line' }
options = OptConf( **parm )
opt = options.opt
args = options.args
conf = options.conf
alertr = RedisClient(db='alert', **conf['database'])
print "redis connect stat: %s" % alertr.ping()
data = opt['fname']
# despite the "one json per line" help text, the file is parsed as yaml.
data = yaml.load(open(data))
dict_obj = dict()
# Map each entry's type onto a collector category.
# NOTE(review): chunk ends at the classification; the loop body likely
# continues beyond what is visible here.
for i in data:
    key = '%s' % i['name']
    # `type` shadows the builtin -- left as-is (doc-only pass).
    type = i['type']
    if re.match('SYSTEM|SCRIPT|DEFAULT', type, re.I):
        name = 'client'
    elif re.match('HTTP', type, re.I):
        name = 'http'
    elif re.match('TCP', type, re.I):
        name = 'tcp'
    else:
        continue
class AlarmMain(object):
    """Main alarm coordinator.

    Loads policy/collector configs from redis, fans check work out to
    worker processes through queues, collects alarm results, and delivers
    notifications by email / sms / xmpp, honouring the blacklist.
    """

    def __init__(self, **conf):
        # save yaml conf.
        self.conf = conf
        # redis handlers: range resolver, alert db, hosts db.
        self.ranger = MagicShow(**conf['database'])
        self.alertr = RedisClient(db='alert', **conf['database'])
        self.hostr = RedisClient(db='hosts', **conf['database'])
        # blacklist manager.
        self.bm = BlackManager(**conf['database'])
        # item -> host -> real_key alarm payloads awaiting notification.
        self.alarm_info = dict()
        # alarm times: item -> host -> real_key -> level -> count sent.
        self.alarm_times = dict()
        self.conf_info = dict()
        self.data_info = dict()
        self.time_stamp = dict()
        self.time_stamp['check_time'] = dict()
        self.time_stamp['alarm_time'] = dict()
        # which ones are being checked.
        self.alarm_check = dict()
        self.alive = True
        # queues feeding the check / alarm worker processes.
        self.check_enqueue = multiprocessing.Queue()
        self.check_dequeue = multiprocessing.Queue()
        self.alarm_enqueue = multiprocessing.Queue()
        self.alarm_dequeue = multiprocessing.Queue()
        # bookkeeping of spawned workers.
        self.procs = dict()
        # create xmpp object, start it and join its room.
        self.xmpp = XMPPBOT(address=("125.39.223.167",5222), **conf['database'])
        self.xmpp.start()
        self.xmpp.enter_room()

    def get_conf(self):
        """Merge policy and collector configs for every conf type.

        Returns a dict keyed by '<item>|<conf_type>' whose values are the
        policy configs enriched with the collector's interval/type (and
        port for tcp), with 'target' expanded to a host list.
        """
        conf_info = dict()
        for conf_type in ['client', 'tcp', 'http']:
            policy_configs = self.alertr.hgetall('total:%s:policy:configs' % conf_type)
            collector_configs = self.alertr.hgetall('total:%s:collector:configs' % conf_type)
            for item, policy_conf in policy_configs.items():
                # NOTE(review): policy_conf is still a raw JSON string here,
                # so this is a substring test, not a key test -- confirm.
                if 'collector' not in policy_conf:continue
                try:
                    policy_conf = json.loads(policy_conf)
                except Exception as e:
                    continue
                collector_name = policy_conf['collector']
                target = policy_conf.pop('target')
                # skip policies whose collector config is missing.
                if collector_name in collector_configs:
                    collector_conf = collector_configs[collector_name]
                else:
                    continue
                try:
                    conf = json.loads(collector_conf)
                except Exception as e:
                    continue
                else:
                    # the collector's interval drives the check schedule.
                    collector_interval = conf['interval']
                    policy_conf['interval'] = collector_interval
                # expand the range expression into concrete hosts.
                targets = self.ranger.load(target).show()
                policy_conf['target'] = targets
                policy_conf['type'] = conf['type']
                if conf_type == 'TCP' or conf_type == 'tcp':
                    policy_conf['port'] = conf['port']
                uniq_key = '%s|%s' % (item, conf_type)
                conf_info[uniq_key] = policy_conf
                # make sure per-item alarm state exists.
                if uniq_key not in self.alarm_info:
                    self.alarm_info[uniq_key] = dict()
                if uniq_key not in self.alarm_times:
                    self.alarm_times[uniq_key] = dict()
        return conf_info

    def init_process(self):
        """Spawn 2*cpu check workers and 2*cpu alarm workers."""
        cpu = multiprocessing.cpu_count()
        for i in xrange(cpu*2):
            proc = Process(target=check_data_proc, args=(self.check_enqueue, self.check_dequeue))
            self.procs[proc] = 1
            proc.start()
        for i in xrange(cpu*2):
            proc = Process(target=alarm_data_proc, args=(self.alarm_enqueue, self.alarm_dequeue))
            self.procs[proc] = 1
            proc.start()

    def get_data(self):
        """Attach each item's raw result hash to its config entry."""
        # NOTE(review): data_info is assigned but never used here.
        data_info = dict()
        for item, conf in self.conf_info.items():
            result_key = "result:%s" % conf['collector']
            if not self.alertr.exists(result_key):continue
            result_data = self.alertr.hgetall(result_key)
            # too slow if determine, just leave to process.
            self.conf_info[item]['result'] = result_data

    def save_data(self, item, data):
        """Store a worker's check result for *item*."""
        self.alarm_info[item] = data

    def send_alarm(self, item, data):
        """Render templates and deliver ALARM/RECOVERY notifications.

        data maps 'ALARM'/'RECOVERY' to lists of info dicts; delivery
        methods (email/sms/xmpp) and rate limits come from each info.
        """
        email_error = dict()
        smess_error = dict()
        xmpp_error = dict()
        total_error = list()
        for k, v in data.items():
            for info in v:
                # substitute $key$ placeholders in the message template.
                template = info['template']
                match = re.compile('\$(\w+)\$', re.I)
                while True:
                    whether_match = match.search(template)
                    if whether_match:
                        key = whether_match.groups(0)[0]
                        if key in info:
                            template = template.replace('$'+key+'$', str(info[key]), 1)
                        else:
                            # NOTE(review): '$' is an end-of-string anchor in a
                            # regex, so this pattern can hardly ever match;
                            # re.escape was probably intended -- confirm.
                            template = re.sub(',?$'+key+'$,?', '', template, 1)
                    else:break
                template = re.sub("\n", "", template)
                real_key, host, name, groups = info['real_key'], info['host'], info['name'], info['group']
                host_info, cur_time = self.hostr.hgetall(host), strftime('%H:%M:%S', localtime())
                # email / xmpp / sms message bodies.
                emessage = '%s|%s|%s|%s|%s'%(k, template, host_info['production'], host_info['idc'], \
                    host_info['service']) if host_info else '%s|%s|%s|%s|%s'%(k, template,'','','')
                xmpp_mess = '[%s]%s -- 机房:%s, 产品线:%s, 服务:%s 报警时间:%s'\
                    %(k, template, host_info['idc'], host_info['production'], host_info['service'], cur_time)\
                    if host_info else '[%s]%s -- 报警时间:%s' % (k, template, cur_time)
                message = '[%s]%s' % (k, template.replace(host, host.split('?')[0]))
                # long script output is elided from the sms text.
                if 'message' in info and len(info['message']) >= 100:
                    message = message.encode().replace(info['message'],'(output too long,read email)', 1)
                level = str(info['level'])
                readable_time = strftime('%Y-%m-%d %H:%M:%S')
                # blacklisted hosts are recorded but not notified.
                if self.bm.check(host,name):
                    record = '%s|%s' % (emessage, 'is_black')
                    total_error.append(record)
                    continue
                if k == 'ALARM':
                    record = '%s|%s' % (emessage, 'is_not_black')
                    total_error.append(record)
                    # didn't alarm but recovery
                    if host not in self.alarm_times[item]:
                        self.alarm_times[item][host] = dict()
                    if real_key not in self.alarm_times[item][host]:
                        self.alarm_times[item][host][real_key] = dict()
                    if level not in self.alarm_times[item][host][real_key]:
                        self.alarm_times[item][host][real_key][level] = 0
                    # stop repeating once the per-level send limit is hit
                    # (limit 0 means unlimited).
                    if self.alarm_times[item][host][real_key][level] >= info['limit'] and \
                        info['limit'] != 0:
                        print "%s * message:%s, limitation:%s" % (readable_time, message, info['limit'])
                        continue
                elif k == 'RECOVERY':
                    #recovery but delete
                    try:
                        del self.alarm_info[item][host][real_key]
                    except:
                        pass
                    # NOTE(review): only `level` is checked but the entire
                    # real_key entry is deleted -- confirm that is intended.
                    try:
                        if level in self.alarm_times[item][host][real_key]:
                            del self.alarm_times[item][host][real_key]
                        else:
                            continue
                    except:
                        continue
                # collect de-duplicated recipients from every group.
                mobiles = list()
                emails = list()
                xmpps = list()
                for group in groups:
                    group_info = self.alertr.get('group:'+group)
                    # NOTE(review): `break` skips the REMAINING groups when one
                    # is missing; `continue` may have been intended -- confirm.
                    if not group_info:
                        break
                    try:
                        group_info = json.loads(group_info)
                    except Exception as e:
                        print "Error Exception:%s, Group:%s" % (e, group)
                        continue
                    mobile = group_info['mobile'] if 'mobile' in group_info else []
                    email = group_info['email'] if 'email' in group_info else []
                    xmpp = group_info['xmpp'] if 'xmpp' in group_info else []
                    for m in mobile:
                        if m not in mobiles:mobiles.append(m)
                    for e in email:
                        if e not in emails:emails.append(e)
                    for x in xmpp:
                        if x not in xmpps:xmpps.append(x)
                print "%s * message:%s, method:%s" % (readable_time, xmpp_mess, info['method'])
                # only deliver inside the policy's allowed hour ranges.
                if self.check_alarm_time(info['period']):
                    # if real_key in self.alarm_info means alarm again
                    if k == "ALARM":
                        self.alarm_times[item][host][real_key][level] += 1
                    method = info['method']
                    if 'email' in method:
                        for e in emails:
                            if e not in email_error:
                                email_error[e] = dict()
                            if k not in email_error[e]:
                                email_error[e][k] = dict()
                            email_error[e][k][emessage] = 1
                    if 'sms' in method:
                        for m in mobiles:
                            if m not in smess_error:
                                smess_error[m] = dict()
                            smess_error[m][message] = 1
                    if 'xmpp' in method:
                        for x in xmpps:
                            if x not in xmpp_error:
                                xmpp_error[x] = dict()
                            xmpp_error[x][xmpp_mess] = 1
        #if policy deleted, delete record in total:alarm:errors
        if not total_error:
            self.alertr.hdel('total:alarm:errors', item)
        else:
            self.alertr.hset('total:alarm:errors', item, json.dumps(total_error))
        # deliver each channel; failures are printed but never abort the rest.
        if email_error:
            try:send_email(email_error)
            except Exception as e:
                print str(e)
        if smess_error:
            try:send_smess(smess_error)
            except Exception as e:
                print str(e)
        if xmpp_error:
            try:send_xmpp(self.xmpp, xmpp_error)
            except Exception as e:
                print str(e)

    def clean_memory(self):
        """Drop stale error records and timestamps for vanished items."""
        curr_items = self.alertr.hkeys('total:alarm:errors')
        for item in [x for x in curr_items if x not in self.alarm_info or not self.alarm_info[x]]:
            self.alertr.hdel('total:alarm:errors', item)
        # delete timstamps of items no longer configured.
        self.time_stamp['alarm_time'] = dict((k, v) for k, v in self.time_stamp['alarm_time'].items()\
            if k in self.conf_info)
        self.time_stamp['check_time'] = dict((k, v) for k, v in self.time_stamp['check_time'].items()\
            if k in self.conf_info)

    def check_alarm_time(self, period):
        """Return True if the current hour falls inside any 'HH-HH' range.

        Ranges where start == stop are skipped entirely.
        """
        for times in period:
            start,stop = times.split('-')
            if int(stop) == int(start):
                continue
            else:
                cur_hour = localtime()[3]
                # inclusive on both ends: start <= hour <= stop.
                if cur_hour <= int(stop) and int(start) <= cur_hour:
                    return True
        return False

    def close_alarm(self, a, b):
        # signal handler: (a, b) are the usual (signum, frame) arguments.
        self.xmpp.disconnect()
        exit(1)

    def run(self):
        """Main loop: reload configs, schedule checks, dispatch alarms."""
        self.init_process()
        while self.alive:
            # NOTE(review): handlers are re-registered every iteration;
            # once before the loop would suffice.
            for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
                signal.signal(sig, self.close_alarm)
            # get all agent policy, refreshed at most every 15 seconds.
            cur_time = int(time())
            if 'conf_data' not in self.time_stamp or cur_time - self.time_stamp['conf_data'] >=15:
                self.conf_info = self.get_conf()
                self.get_data()
                self.time_stamp['conf_data'] = time()
            # put data into check queue when an item's interval elapsed.
            for item, conf in self.conf_info.items():
                if item not in self.time_stamp['check_time'] or \
                    cur_time - self.time_stamp['check_time'][item] >= self.conf_info[item]['interval']:
                    self.check_enqueue.put([item, conf, self.alarm_info[item]])
                    self.time_stamp['check_time'][item] = time()
            # read data from check queue (drain until empty).
            while True:
                try:
                    item, data = self.check_dequeue.get(1, 0.1)
                except Queue.Empty:
                    break
                else:
                    self.save_data(item, data)
            # put data into alarm queue.
            # .keys() snapshots the keys so entries can be deleted mid-loop.
            for item in self.alarm_info.keys():
                if item not in self.conf_info:
                    del self.alarm_info[item]
                    continue
                else:
                    alarm_time = self.time_stamp['alarm_time']
                    # NOTE(review): check_time is read but never used below.
                    check_time = self.time_stamp['check_time'][item]
                    if not self.alarm_info[item]:
                        if item in alarm_time:del alarm_time[item]
                        continue
                    # re-alarm only after the item's rate interval elapsed.
                    if item not in alarm_time or cur_time - alarm_time[item] >= self.conf_info[item]['rate']:
                        self.alarm_enqueue.put([item, self.conf_info[item], self.alarm_info[item]])
                        self.time_stamp['alarm_time'][item] = cur_time
            # drain the alarm result queue and deliver notifications.
            while True:
                try:
                    item, data = self.alarm_dequeue.get(1, 0.1)
                except Queue.Empty:
                    break
                else:
                    # send alarm data
                    self.send_alarm(item, data)
            # delete recovery errors.
            self.clean_memory()
def load_host(conf): dbconf = conf url = "server/api/servers" username = "******" password = "******" data_dict = {} data = Ldapapi.get_wrapper(url,username,password,data_dict) data_set = dict() i=0 for block in data: i+=1 idc = None if 'idc' not in block else block['idc'].encode('utf-8') hostname = block['hostname'].encode('utf-8') product = None if 'product' not in block or not block['product'] else \ block['product'].encode('utf-8') service = None if 'service' not in block or not block['service'] else block['service'] if isinstance(service,list): for s in service: s = s.encode('utf-8') key = '%s:%s:%s:%s' % ('server',idc,product,s) if key.lower() not in data_set: data_set[key.lower()]=list() data_set[key.lower()].append(hostname) else: key = '%s:%s:%s:%s' % ('server',idc,product,service) if key.lower() not in data_set: data_set[key.lower()]=list() data_set[key.lower()].append(hostname) print "all server total:%s" % i # check servers of the key host_r = RedisClient(db='hosts', **dbconf) range_r = RedisClient(db='range', **dbconf) server_info = dict() for key in data_set: new_server = data_set[key] if range_r.exists(key): old_servers = range_r.smembers(key) need_add = [x for x in new_server if x not in old_servers] need_del = [x for x in old_servers if x not in new_server] if need_add: print "%s set add new server ret: %s" % (key,range_r.sadd(key,*need_add)) if need_del: print "%s set del old server ret: %s" % (key,range_r.srem(key,*need_del)) else: print "new %s set add new server ret: %s" %(key,range_r.sadd(key,*new_server)) (start,idc,product,service)=key.split(':') for s in new_server: if s not in server_info:server_info[s] = dict() server_info[s]['idc'] = idc server_info[s]['production'] = product server_info[s]['service'] = service # del key which not contain in data_set keys = range_r.keys('server:*') delKey = [ x for x in keys if x not in data_set.keys() ] if delKey: print 'delete keys from range db %s ' % range_r.delete(*delKey) old_server = 
host_r.keys('*') del_server = [ x for x in old_server if x not in server_info ] for s,v in server_info.items(): host_r.hmset(s,v) if del_server: print "delete old servers not in new server, %s " % host_r.delete(*del_server)
class AgentMonitor(multiprocessing.Process):
    """Worker process that keeps non-blocking epoll connections to client
    agents (port 1021), pushes configs to them and records their status."""

    def __init__(self, add_queue, del_queue, **conf):
        multiprocessing.Process.__init__(self)
        self.conf = conf
        self.ranger = MagicShow(**self.conf['database'])
        self.alertr = RedisClient(db='alert', **self.conf['database'])
        # save all servers of this process.
        self.total_server = list()
        # queue for add new servers.
        self.add_queue = add_queue
        # queue for del deleted servers.
        self.del_queue = del_queue
        # vars for loop.
        self.alive = True
        # vars for saving client data.
        self.data_info = dict()
        # vars for saving sockets: host -> socket.
        self.connections = dict()
        # fd -> host reverse map.
        self.filenos = dict()
        # save some timestamps.
        self.timestamps = dict()
        # save new coming (partial) data per host.
        self.requests = dict()
        # epoll object.
        self.epoll = select.epoll()
        # save each host's configs.
        self.host_configs = dict()

    def init_connection(self):
        """Connect to every owned server and register sockets with epoll;
        tear down connections to servers no longer owned."""
        if 'client_status' not in self.data_info:
            self.data_info['client_status'] = dict()
        sock_timeout = 0.01
        for host in self.total_server:
            # if connection is not failed, continue.
            if host in self.connections:continue
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            # SO_LINGER on, timeout 0: close() resets instead of lingering.
            optval = struct.pack('ii', 1, 0)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, optval)
            sock.settimeout(sock_timeout)
            # NOTE(review): on connect failure the socket object is not
            # closed, leaking the fd until GC -- confirm/fix upstream.
            try:sock.connect((host, 1021))
            except socket.error as msg:
                self.data_info['client_status'][host] = json.dumps({'data' : 'Connection Failed', \
                    'time' : int(time())})
            else:
                # switch to non-blocking mode and watch for readability.
                sock.settimeout(None)
                sock.setblocking(0)
                self.data_info['client_status'][host] = json.dumps({'data' : 'Connection Success', \
                    'time' : int(time())})
                fileno = sock.fileno()
                self.connections[host] = sock
                self.filenos[fileno] = host
                self.requests[host] = ''
                self.epoll.register(fileno, select.EPOLLIN)
        # drop connections to hosts that were removed from total_server.
        for host in set(self.connections) - set(self.total_server):
            sock = self.connections[host]
            # delete information of deleted servers.
            self.data_info['client_status'].pop(host, None)
            self.recycle_connection(sock)

    def send_data(self, sock, data):
        """Send *data* with a 10-digit length prefix; recycle on error."""
        data = "%010d%s" % (len(data), data)
        try:
            sock.sendall(data)
        except socket.error as msg:
            self.recycle_connection(sock)

    def list_servers(self, regex):
        # expand a range expression into the matching host list.
        return self.ranger.load(regex).show()

    def recycle_connection(self, sock):
        """Unregister, close and forget a client socket."""
        fileno = sock.fileno()
        host = self.filenos[fileno]
        # record failed clients.
        self.data_info['client_status'][host] = json.dumps({'data' : 'Connection Failed',\
            'time' : int(time())})
        # cancel epoll event.
        self.epoll.unregister(fileno)
        # close socket.
        self.connections[host].close()
        # delete records.
        del self.connections[host]
        del self.requests[host]
        del self.filenos[fileno]

    def get_config(self):
        """Build host -> NEWCONFIG payload from client collector configs.

        Returns {host: json} where json wraps the list of configs whose
        target range includes the host; {} when nothing matched.
        """
        # get all configs about client.
        configs = self.alertr.hgetall('total:client:collector:configs')
        find_host = dict()
        final_data = dict()
        data = dict()
        for item, conf in configs.items():
            try:
                conf = json.loads(conf)
            except Exception, e:
                print str(e)
                continue
            target = conf.pop('target')
            # cache expr which already select in redis.
            if target not in find_host:
                hosts = find_host[target] = self.list_servers(target)
            else:
                hosts = find_host[target]
            # group configs per host.
            for host in hosts:
                if host not in data:data[host] = list()
                data[host].append(conf)
        final_data = dict((k, json.dumps({'action' : 'NEWCONFIG', 'data' : v})) for k, v in data.items())
        return final_data if data else {}