Beispiel #1
0
class BlackManager(object):
    """Maintain alarm blacklist entries in the 'black' redis database.

    Every entry is a redis key of the form ``black_<policy>_<host>`` whose
    value is the string ``'black'``.  ``addbl`` stores the keys with an
    expiry, ``delbl`` removes them, ``check`` tests whether a host is
    covered and ``show`` prints the current entries.
    """

    def __init__(self, **conf):
        """Create the redis client and the range resolver from ``conf``."""
        self.blackr = RedisClient(db='black', **conf)
        self.ranger = MagicShow(**conf)

    def _resolve_hosts(self, range_expr):
        """Expand ``range_expr`` into an iterable of host names.

        The resolver may answer with the literal string ``'all'``; it is
        wrapped in a list so callers build the ``black_<policy>_all`` key
        that ``check`` looks for, instead of iterating over its characters.
        """
        hosts = self.ranger.load(range_expr).show()
        if hosts == 'all':
            return ['all']
        return hosts

    def addbl(self, data):
        """Blacklist every host of ``data['range']`` for ``data['policy']``.

        ``data`` must provide the keys ``'expire'`` (passed to redis as the
        ``ex`` argument), ``'policy'`` and ``'range'``.
        """
        expire = data['expire']
        policy = data['policy']
        for host in self._resolve_hosts(data['range']):
            self.blackr.set('black_%s_%s' % (policy, host), 'black', ex=expire)

    def delbl(self, data):
        """Remove blacklist entries for the hosts of ``data['range']``.

        When ``data['policy']`` is ``'all'`` every policy entry of each host
        is deleted, otherwise only the entry of the named policy.
        """
        print(data)
        policy = data['policy']
        for host in self._resolve_hosts(data['range']):
            if policy != 'all':
                self.blackr.delete('black_%s_%s' % (policy, host))
                continue
            for key in self.blackr.keys('black_*_%s' % host):
                self.blackr.delete(key)

    def check(self, host, policy=None):
        """Return True when ``host`` is blacklisted, False otherwise.

        The host-wide key ``black_all_<host>`` is always consulted; when
        ``policy`` is given, ``black_<policy>_<host>`` and
        ``black_<policy>_all`` are consulted as well.
        """
        keys = ['black_all_%s' % (host)]
        if policy:
            keys.append('black_%s_%s' % (policy, host))
            keys.append('black_%s_all' % (policy))
        return any(self.blackr.exists(key) for key in keys)

    def show(self):
        """Print policy, host and remaining lifetime of every entry.

        Keys that do not split into exactly three ``_`` separated parts, or
        whose TTL cannot be converted, are printed behind a marker line
        instead of aborting the listing.
        """
        for k in self.blackr.keys('black*'):
            try:
                _, policy, host = k.split('_')
                # pttl is in milliseconds; the report is in whole seconds.
                seconds = int(self.blackr.pttl(k) / 1000)
            except Exception:
                print("================= %s" % k)
            else:
                print("策略:%s, 主机:%s, 剩余时间:%s 秒" % (policy, host, seconds))
Beispiel #2
0
class AlarmMain(object):
    """Main loop of the alarm service.

    The object periodically loads policy/collector configuration and
    collector results from the 'alert' redis database, hands them to worker
    processes through multiprocessing queues, and turns the workers' answers
    into e-mail, SMS and XMPP notifications.
    """

    # Address of the XMPP server used for chat notifications.
    XMPP_ADDRESS = ("125.39.223.167", 5222)

    def __init__(self, **conf):
        """Build redis clients, queues and the XMPP connection.

        ``conf`` is the parsed configuration; its ``'database'`` entry is
        forwarded as keyword arguments to every helper object.
        """
        # Keep the full configuration for later use.
        self.conf = conf
        database = conf['database']
        # Range resolver and redis handles.
        self.ranger = MagicShow(**database)
        self.alertr = RedisClient(db='alert', **database)
        self.hostr = RedisClient(db='hosts', **database)
        # Blacklist manager.
        self.bm = BlackManager(**database)
        # Latest check result per policy key.
        self.alarm_info = dict()
        # Sent-alarm counters: policy key -> host -> real_key -> level -> count.
        self.alarm_times = dict()
        self.conf_info = dict()
        self.data_info = dict()
        # 'check_time' / 'alarm_time' map a policy key to its last timestamp.
        self.time_stamp = dict()
        self.time_stamp['check_time'] = dict()
        self.time_stamp['alarm_time'] = dict()
        self.alarm_check = dict()
        # run() keeps looping while this flag is true.
        self.alive = True
        # Request/response queues shared with the worker processes.
        self.check_enqueue = multiprocessing.Queue()
        self.check_dequeue = multiprocessing.Queue()
        self.alarm_enqueue = multiprocessing.Queue()
        self.alarm_dequeue = multiprocessing.Queue()
        # Started worker processes.
        self.procs = dict()
        # Connect the XMPP bot and join its room.
        self.xmpp = XMPPBOT(address=self.XMPP_ADDRESS, **database)
        self.xmpp.start()
        self.xmpp.enter_room()

    def get_conf(self):
        """Return the usable policies keyed by ``'<item>|<conf_type>'``.

        Policies that cannot be parsed, reference an unknown collector or
        lack a required field are skipped.  For every returned key an empty
        entry is created in ``alarm_info`` and ``alarm_times`` if missing.
        """
        conf_info = dict()
        for conf_type in ('client', 'tcp', 'http'):
            policy_configs = self.alertr.hgetall('total:%s:policy:configs' % conf_type)
            collector_configs = self.alertr.hgetall('total:%s:collector:configs' % conf_type)
            for item, raw_policy in policy_configs.items():
                policy_conf = self._load_policy(conf_type, raw_policy, collector_configs)
                if policy_conf is None:
                    continue
                uniq_key = '%s|%s' % (item, conf_type)
                conf_info[uniq_key] = policy_conf
                self.alarm_info.setdefault(uniq_key, dict())
                self.alarm_times.setdefault(uniq_key, dict())
        return conf_info

    def _load_policy(self, conf_type, raw_policy, collector_configs):
        """Parse one raw policy and merge its collector settings, or return None."""
        # Cheap pre-filter on the raw text before paying for JSON parsing.
        if 'collector' not in raw_policy:
            return None
        try:
            policy_conf = json.loads(raw_policy)
            collector_conf = json.loads(collector_configs[policy_conf['collector']])
            target = policy_conf.pop('target')
            policy_conf['interval'] = collector_conf['interval']
            policy_conf['type'] = collector_conf['type']
            if conf_type.lower() == 'tcp':
                policy_conf['port'] = collector_conf['port']
        except (ValueError, TypeError, KeyError):
            # Malformed JSON, unknown collector or missing field: skip the
            # policy instead of taking the whole main loop down.
            return None
        # Replace the range expression by the hosts it expands to.
        policy_conf['target'] = self.ranger.load(target).show()
        return policy_conf

    def init_process(self):
        """Start ``2 * cpu_count`` check workers and as many alarm workers."""
        workers_per_kind = multiprocessing.cpu_count() * 2
        plans = (
            (check_data_proc, self.check_enqueue, self.check_dequeue),
            (alarm_data_proc, self.alarm_enqueue, self.alarm_dequeue),
        )
        for target, inbox, outbox in plans:
            for _ in range(workers_per_kind):
                proc = Process(target=target, args=(inbox, outbox))
                self.procs[proc] = 1
                proc.start()

    def get_data(self):
        """Attach the stored collector result to every loaded policy.

        The hash ``result:<collector>`` is stored unprocessed under the
        policy's ``'result'`` key; evaluating it is left to the workers.
        """
        for item, conf in self.conf_info.items():
            result_key = "result:%s" % conf['collector']
            if not self.alertr.exists(result_key):
                continue
            self.conf_info[item]['result'] = self.alertr.hgetall(result_key)

    def save_data(self, item, data):
        """Remember ``data`` as the latest check result of policy ``item``."""
        self.alarm_info[item] = data

    def _render_template(self, info):
        """Expand the ``$name$`` placeholders of ``info['template']``.

        A placeholder whose name is a key of ``info`` is replaced by that
        value; any other placeholder is removed together with a comma
        directly before and/or after it.  Newlines are stripped from the
        result.
        """
        template = info['template']
        placeholder = re.compile(r'\$(\w+)\$', re.I)
        while True:
            found = placeholder.search(template)
            if not found:
                break
            key = found.group(1)
            token = '$' + key + '$'
            if key in info:
                template = template.replace(token, str(info[key]), 1)
            else:
                # '$' must be escaped: unescaped it is the end-of-string
                # anchor, the pattern never matches and the loop never ends.
                template = re.sub(',?' + re.escape(token) + ',?', '', template, count=1)
        return template.replace("\n", "")

    def _collect_contacts(self, groups):
        """Return de-duplicated ``(mobiles, emails, xmpps)`` of ``groups``.

        Groups that are not stored in redis or whose definition is not valid
        JSON are skipped; the remaining groups are still processed.
        """
        mobiles, emails, xmpps = list(), list(), list()
        buckets = (('mobile', mobiles), ('email', emails), ('xmpp', xmpps))
        for group in groups:
            group_info = self.alertr.get('group:' + group)
            if not group_info:
                continue
            try:
                group_info = json.loads(group_info)
            except Exception as e:
                print("Error Exception:%s, Group:%s" % (e, group))
                continue
            for field, bucket in buckets:
                contacts = group_info[field] if field in group_info else []
                for contact in contacts:
                    if contact not in bucket:
                        bucket.append(contact)
        return mobiles, emails, xmpps

    def send_alarm(self, item, data):
        """Send the notifications for one policy and record its errors.

        ``data`` maps an event kind (the code special-cases ``'ALARM'`` and
        ``'RECOVERY'``) to a list of ``info`` dicts.  Blacklisted hosts are
        only recorded.  An alarm is suppressed once its per-level counter
        reached a non-zero ``info['limit']``; a recovery is only announced
        when an alarm counter exists for its level.  Nothing is sent outside
        the windows of ``info['period']``.  The records of this run are
        written to the redis hash ``total:alarm:errors``.
        """
        email_error = dict()
        smess_error = dict()
        xmpp_error = dict()
        total_error = list()
        for k, v in data.items():
            for info in v:
                template = self._render_template(info)
                real_key, host = info['real_key'], info['host']
                name, groups = info['name'], info['group']
                host_info = self.hostr.hgetall(host)
                cur_time = strftime('%H:%M:%S', localtime())
                if host_info:
                    emessage = '%s|%s|%s|%s|%s' % (
                        k, template, host_info['production'],
                        host_info['idc'], host_info['service'])
                    xmpp_mess = '[%s]%s -- 机房:%s, 产品线:%s, 服务:%s 报警时间:%s' % (
                        k, template, host_info['idc'],
                        host_info['production'], host_info['service'], cur_time)
                else:
                    emessage = '%s|%s|%s|%s|%s' % (k, template, '', '', '')
                    xmpp_mess = '[%s]%s -- 报警时间:%s' % (k, template, cur_time)
                # The short message shows the host without its '?...' suffix.
                message = '[%s]%s' % (k, template.replace(host, host.split('?')[0]))
                if 'message' in info and len(info['message']) >= 100:
                    # NOTE(review): encode() is kept from the original; it
                    # presumably turns a unicode message into a byte string
                    # on Python 2 -- confirm before touching.
                    message = message.encode().replace(
                        info['message'], '(output too long,read email)', 1)
                level = str(info['level'])
                readable_time = strftime('%Y-%m-%d %H:%M:%S')

                if self.bm.check(host, name):
                    total_error.append('%s|%s' % (emessage, 'is_black'))
                    continue

                if k == 'ALARM':
                    total_error.append('%s|%s' % (emessage, 'is_not_black'))
                    counters = self.alarm_times[item].setdefault(
                        host, dict()).setdefault(real_key, dict())
                    counters.setdefault(level, 0)
                    if info['limit'] != 0 and counters[level] >= info['limit']:
                        print("%s * message:%s, limitation:%s" % (
                            readable_time, message, info['limit']))
                        continue
                elif k == 'RECOVERY':
                    # The problem is gone: forget the stored check result.
                    try:
                        del self.alarm_info[item][host][real_key]
                    except (KeyError, TypeError):
                        pass
                    # Announce the recovery only if an alarm was counted for
                    # this level; the counters are reset at the same time.
                    try:
                        if level not in self.alarm_times[item][host][real_key]:
                            continue
                        del self.alarm_times[item][host][real_key]
                    except (KeyError, TypeError):
                        continue

                mobiles, emails, xmpps = self._collect_contacts(groups)

                print("%s * message:%s, method:%s" % (
                    readable_time, xmpp_mess, info['method']))
                if not self.check_alarm_time(info['period']):
                    continue
                if k == "ALARM":
                    self.alarm_times[item][host][real_key][level] += 1
                method = info['method']
                if 'email' in method:
                    for address in emails:
                        email_error.setdefault(address, dict()).setdefault(
                            k, dict())[emessage] = 1
                if 'sms' in method:
                    for number in mobiles:
                        smess_error.setdefault(number, dict())[message] = 1
                if 'xmpp' in method:
                    for jid in xmpps:
                        xmpp_error.setdefault(jid, dict())[xmpp_mess] = 1

        # Without records the policy's entry is removed from the error hash.
        if not total_error:
            self.alertr.hdel('total:alarm:errors', item)
        else:
            self.alertr.hset('total:alarm:errors', item, json.dumps(total_error))
        # A failing channel is reported and must not block the others.
        if email_error:
            try:
                send_email(email_error)
            except Exception as e:
                print(str(e))
        if smess_error:
            try:
                send_smess(smess_error)
            except Exception as e:
                print(str(e))
        if xmpp_error:
            try:
                send_xmpp(self.xmpp, xmpp_error)
            except Exception as e:
                print(str(e))

    def clean_memory(self):
        """Drop error records and timestamps of policies that are gone.

        Entries of ``total:alarm:errors`` without a non-empty check result
        are deleted, and both timestamp tables are reduced to the policies
        present in ``conf_info``.
        """
        for item in self.alertr.hkeys('total:alarm:errors'):
            if item not in self.alarm_info or not self.alarm_info[item]:
                self.alertr.hdel('total:alarm:errors', item)
        for stamp in ('alarm_time', 'check_time'):
            self.time_stamp[stamp] = dict(
                (k, v) for k, v in self.time_stamp[stamp].items()
                if k in self.conf_info)

    def check_alarm_time(self, period):
        """Return True when the current local hour lies in a window.

        ``period`` is an iterable of ``'<start>-<stop>'`` hour strings with
        inclusive bounds.  Windows with equal bounds are ignored, and a
        window whose start is greater than its stop never matches.
        """
        cur_hour = localtime()[3]
        for times in period:
            start, stop = times.split('-')
            if int(stop) == int(start):
                continue
            if int(start) <= cur_hour <= int(stop):
                return True
        return False

    def close_alarm(self, a, b):
        """Signal handler: disconnect the XMPP bot and exit with status 1.

        ``a`` and ``b`` are the signal number and stack frame supplied by
        the ``signal`` module; neither is used.
        """
        import sys
        self.xmpp.disconnect()
        sys.exit(1)

    @staticmethod
    def _drain(source, handler):
        """Pass queued ``(item, data)`` pairs to ``handler`` until a 0.1s get times out."""
        while True:
            try:
                item, data = source.get(True, 0.1)
            except Queue.Empty:
                break
            handler(item, data)

    def run(self):
        """Start the workers and run the scheduling loop while ``alive``."""
        self.init_process()
        # Registered after the workers are started, and only once.
        for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
            signal.signal(sig, self.close_alarm)
        while self.alive:
            cur_time = int(time())
            # Reload configuration and collector results every 15 seconds.
            if 'conf_data' not in self.time_stamp or \
                    cur_time - self.time_stamp['conf_data'] >= 15:
                self.conf_info = self.get_conf()
                self.get_data()
                self.time_stamp['conf_data'] = time()
            # Queue every policy whose check interval has elapsed.
            check_time = self.time_stamp['check_time']
            for item, conf in self.conf_info.items():
                if item not in check_time or \
                        cur_time - check_time[item] >= conf['interval']:
                    self.check_enqueue.put([item, conf, self.alarm_info[item]])
                    check_time[item] = time()
            # Store the finished check results.
            self._drain(self.check_dequeue, self.save_data)
            # Queue pending problems, at most once per policy 'rate' seconds.
            alarm_time = self.time_stamp['alarm_time']
            for item in list(self.alarm_info):
                if item not in self.conf_info:
                    del self.alarm_info[item]
                    continue
                pending = self.alarm_info[item]
                if not pending:
                    alarm_time.pop(item, None)
                    continue
                if item not in alarm_time or \
                        cur_time - alarm_time[item] >= self.conf_info[item]['rate']:
                    self.alarm_enqueue.put([item, self.conf_info[item], pending])
                    alarm_time[item] = cur_time
            # Send what the alarm workers produced.
            self._drain(self.alarm_dequeue, self.send_alarm)
            # Forget records of recovered or removed policies.
            self.clean_memory()
Beispiel #3
0
def load_host(conf):
    """Synchronise the server inventory into the 'range' and 'hosts' redis dbs.

    The inventory is fetched with ``Ldapapi.get_wrapper``.  Host names are
    grouped under lower-cased keys ``server:<idc>:<product>:<service>``; each
    group is mirrored into a redis set of the 'range' database and every
    host gets a hash with ``idc``, ``production`` and ``service`` in the
    'hosts' database.  ``server:*`` sets and host hashes that no longer
    appear in the inventory are deleted.

    ``conf`` holds the keyword arguments for ``RedisClient``.
    """
    url = "server/api/servers"
    username = "******"
    password = "******"
    data = Ldapapi.get_wrapper(url, username, password, {})

    # Group host names by their lower-cased range key.
    data_set = dict()
    total = 0
    for block in data:
        total += 1
        idc = None if 'idc' not in block else block['idc'].encode('utf-8')
        hostname = block['hostname'].encode('utf-8')
        product = None if 'product' not in block or not block['product'] else \
            block['product'].encode('utf-8')
        service = None if 'service' not in block or not block['service'] else block['service']
        # A host with a list of services is filed once per service.
        if isinstance(service, list):
            labels = [s.encode('utf-8') for s in service]
        else:
            labels = [service]
        for label in labels:
            key = ('%s:%s:%s:%s' % ('server', idc, product, label)).lower()
            data_set.setdefault(key, list()).append(hostname)
    print("all server total:%s" % total)

    host_r = RedisClient(db='hosts', **conf)
    range_r = RedisClient(db='range', **conf)

    # Bring every range set in line with the fetched membership.
    server_info = dict()
    for key in data_set:
        new_server = data_set[key]
        if range_r.exists(key):
            old_servers = range_r.smembers(key)
            wanted = set(new_server)
            need_add = [x for x in new_server if x not in old_servers]
            need_del = [x for x in old_servers if x not in wanted]
            if need_add:
                print("%s set add new server ret: %s" % (key, range_r.sadd(key, *need_add)))
            if need_del:
                print("%s set del old server ret: %s" % (key, range_r.srem(key, *need_del)))
        else:
            print("new %s set add new server ret: %s" % (key, range_r.sadd(key, *new_server)))
        # The attributes are taken back from the (lower-cased) key.
        (start, idc, product, service) = key.split(':')
        for s in new_server:
            if s not in server_info:
                server_info[s] = dict()
            server_info[s]['idc'] = idc
            server_info[s]['production'] = product
            server_info[s]['service'] = service

    # Delete range sets that are not part of the fetched inventory.
    delKey = [x for x in range_r.keys('server:*') if x not in data_set]
    if delKey:
        print('delete keys from range db %s ' % range_r.delete(*delKey))

    # Determine stale hosts before writing, then refresh and clean up.
    del_server = [x for x in host_r.keys('*') if x not in server_info]
    for s, v in server_info.items():
        host_r.hmset(s, v)
    if del_server:
        print("delete old servers not in new server, %s " % host_r.delete(*del_server))