Example #1
0
 def __init__(self, **conf):
     """Create the storage clients, bookkeeping tables, queues and XMPP bot.

     ``conf`` is kept on the instance as-is; ``conf['database']`` is passed
     as keyword arguments to every client constructed here.
     """
     # keep the (yaml) configuration around
     self.conf = conf
     db_conf = conf['database']

     # redis-backed handlers
     self.ranger = MagicShow(**db_conf)
     self.alertr = RedisClient(db='alert', **db_conf)
     self.hostr = RedisClient(db='hosts', **db_conf)
     # blacklist manager
     self.bm = BlackManager(**db_conf)

     # per-item bookkeeping tables
     self.alarm_info = {}
     self.alarm_times = {}  # alarm times
     self.conf_info = {}
     self.data_info = {}
     self.time_stamp = {'check_time': {}, 'alarm_time': {}}
     self.alarm_check = {}  # which ones are being checked

     self.alive = True

     # queues, created in the same order as before: check in/out, alarm in/out
     self.check_enqueue = multiprocessing.Queue()
     self.check_dequeue = multiprocessing.Queue()
     self.alarm_enqueue = multiprocessing.Queue()
     self.alarm_dequeue = multiprocessing.Queue()

     self.procs = {}

     # build the xmpp object, start it and enter the room
     xmpp_address = ("125.39.223.167", 5222)
     self.xmpp = XMPPBOT(address=xmpp_address, **db_conf)
     self.xmpp.start()
     self.xmpp.enter_room()
Example #2
0
class AlarmMain(object):
    """Main loop of the alarm service.

    The instance periodically rebuilds the policy configuration from redis
    (``get_conf``/``get_data``), puts work on ``check_enqueue`` for the
    ``check_data_proc`` workers, stores what comes back on ``check_dequeue``
    in ``alarm_info``, puts non-empty entries on ``alarm_enqueue`` for the
    ``alarm_data_proc`` workers and passes what comes back on
    ``alarm_dequeue`` to ``send_alarm``.
    """

    # Matches a ``$name$`` placeholder inside an alarm template.
    _PLACEHOLDER = re.compile(r'\$(\w+)\$')

    def __init__(self, **conf):
        """Create the storage clients, bookkeeping tables, queues and XMPP bot.

        ``conf`` is stored unchanged; ``conf['database']`` is passed as
        keyword arguments to every client constructed here.
        """
        # save yaml conf.
        self.conf = conf
        # redis handler
        self.ranger = MagicShow(**conf['database'])
        self.alertr = RedisClient(db='alert', **conf['database'])
        self.hostr = RedisClient(db='hosts', **conf['database'])
        # blacklist manager.
        self.bm = BlackManager(**conf['database'])
        # item -> data last returned by the check workers
        self.alarm_info = dict()
        # item -> host -> real_key -> level -> number of alarms sent
        self.alarm_times = dict()
        # item -> merged policy/collector configuration (see get_conf)
        self.conf_info = dict()
        self.data_info = dict()
        # 'check_time'/'alarm_time' map item -> last enqueue time;
        # run() also stores the last config reload under 'conf_data'.
        self.time_stamp = dict()
        self.time_stamp['check_time'] = dict()
        self.time_stamp['alarm_time'] = dict()
        # which ones are being checked
        self.alarm_check = dict()
        # run() loops while this is true
        self.alive = True
        # work queues shared with the worker processes
        self.check_enqueue = multiprocessing.Queue()
        self.check_dequeue = multiprocessing.Queue()
        self.alarm_enqueue = multiprocessing.Queue()
        self.alarm_dequeue = multiprocessing.Queue()
        # started worker processes (used as dict keys)
        self.procs = dict()
        # create xmpp object
        # NOTE(review): the XMPP server address is hard-coded here.
        self.xmpp = XMPPBOT(address=("125.39.223.167", 5222), **conf['database'])
        self.xmpp.start()
        self.xmpp.enter_room()

    def get_conf(self):
        """Build the merged policy configuration from redis.

        For each of the ``client``, ``tcp`` and ``http`` types the policy and
        collector hashes are read; every policy that parses as JSON, names a
        known collector and has a ``target`` is merged with its collector's
        ``interval`` and ``type`` (plus ``port`` for ``tcp``), and its target
        expression is expanded through ``self.ranger``.  Anything malformed
        is skipped.

        As a side effect an empty entry is created in ``alarm_info`` and
        ``alarm_times`` for every key that does not have one yet.

        Returns a dict mapping ``'<policy name>|<type>'`` to the merged
        policy dict.
        """
        conf_info = dict()
        for conf_type in ['client', 'tcp', 'http']:
            policy_configs = self.alertr.hgetall('total:%s:policy:configs' % conf_type)
            collector_configs = self.alertr.hgetall('total:%s:collector:configs' % conf_type)
            for item, raw_policy in policy_configs.items():
                try:
                    policy_conf = json.loads(raw_policy)
                except (TypeError, ValueError):
                    continue
                # Check the parsed keys (not a substring of the raw JSON) so a
                # policy lacking 'collector' or 'target' is skipped instead of
                # raising KeyError and taking the main loop down.
                if not isinstance(policy_conf, dict):
                    continue
                if 'collector' not in policy_conf or 'target' not in policy_conf:
                    continue

                collector_name = policy_conf['collector']
                if collector_name not in collector_configs:
                    continue
                try:
                    conf = json.loads(collector_configs[collector_name])
                except (TypeError, ValueError):
                    continue

                target = policy_conf.pop('target')
                policy_conf['interval'] = conf['interval']
                policy_conf['target'] = self.ranger.load(target).show()
                policy_conf['type'] = conf['type']
                if conf_type == 'tcp':
                    policy_conf['port'] = conf['port']

                uniq_key = '%s|%s' % (item, conf_type)
                conf_info[uniq_key] = policy_conf
                if uniq_key not in self.alarm_info:
                    self.alarm_info[uniq_key] = dict()
                if uniq_key not in self.alarm_times:
                    self.alarm_times[uniq_key] = dict()
        return conf_info

    def init_process(self):
        """Start ``2 * cpu_count`` check workers and as many alarm workers.

        Every started process is recorded as a key of ``self.procs``.
        """
        workers = multiprocessing.cpu_count() * 2
        for _ in range(workers):
            proc = Process(target=check_data_proc, args=(self.check_enqueue, self.check_dequeue))
            self.procs[proc] = 1
            proc.start()
        for _ in range(workers):
            proc = Process(target=alarm_data_proc, args=(self.alarm_enqueue, self.alarm_dequeue))
            self.procs[proc] = 1
            proc.start()

    def get_data(self):
        """Attach the collector result hash to every policy in ``conf_info``.

        For each policy the redis hash ``result:<collector>`` is stored under
        the policy's ``'result'`` key when that hash exists.
        """
        for conf in self.conf_info.values():
            result_key = "result:%s" % conf['collector']
            if not self.alertr.exists(result_key):
                continue
            # too slow if determine, just leave to process.
            conf['result'] = self.alertr.hgetall(result_key)

    def save_data(self, item, data):
        """Remember ``data`` as the current check result for ``item``."""
        self.alarm_info[item] = data

    @classmethod
    def _render_template(cls, template, info):
        """Expand ``$key$`` placeholders in ``template`` and strip newlines.

        A placeholder whose key is in ``info`` is replaced by ``str`` of the
        value; a placeholder with an unknown key is removed together with
        one directly adjacent comma on either side.
        """
        while True:
            found = cls._PLACEHOLDER.search(template)
            if not found:
                break
            placeholder = '$' + found.group(1) + '$'
            if found.group(1) in info:
                template = template.replace(placeholder, str(info[found.group(1)]), 1)
            else:
                # '$' must be escaped: unescaped it is the end-of-string
                # anchor, the pattern never matches the placeholder and this
                # loop would never terminate.
                template = re.sub(',?' + re.escape(placeholder) + ',?', '', template, 1)
        return template.replace("\n", "")

    def _collect_recipients(self, groups):
        """Return ``(mobiles, emails, xmpps)`` gathered from ``groups``.

        Each group is read from redis key ``group:<name>`` and parsed as
        JSON; entries are de-duplicated while keeping first-seen order.
        Missing or unparsable groups are skipped.
        """
        mobiles, emails, xmpps = list(), list(), list()
        buckets = (('mobile', mobiles), ('email', emails), ('xmpp', xmpps))
        for group in groups:
            group_info = self.alertr.get('group:' + group)
            if not group_info:
                # Skip only this group; stopping here would silently drop the
                # recipients of every group listed after a missing one.
                continue
            try:
                group_info = json.loads(group_info)
            except Exception as e:
                print("Error Exception:%s, Group:%s" % (e, group))
                continue
            for field, bucket in buckets:
                entries = group_info[field] if field in group_info else []
                for entry in entries:
                    if entry not in bucket:
                        bucket.append(entry)
        return mobiles, emails, xmpps

    def send_alarm(self, item, data):
        """Turn the alarm worker's result for ``item`` into notifications.

        ``data`` maps a kind (``'ALARM'`` and ``'RECOVERY'`` get special
        handling) to a list of info dicts.  For every info dict the message
        template is rendered, blacklisted host/name pairs are only recorded,
        the per-level alarm limit is enforced for alarms, recoveries without
        a previously counted alarm are ignored, and the remaining messages
        are grouped per recipient when the current hour is inside one of the
        info's periods.

        The list of recorded alarms is written to the redis hash
        ``total:alarm:errors`` (or the field is removed when the list is
        empty), then e-mail, SMS and XMPP messages are sent; a failure in
        any sender is printed and does not stop the others.
        """
        email_error = dict()
        smess_error = dict()
        xmpp_error = dict()
        total_error = list()
        for k, v in data.items():
            for info in v:
                template = self._render_template(info['template'], info)
                real_key, host, name, groups = info['real_key'], info['host'], info['name'], info['group']
                host_info = self.hostr.hgetall(host)
                cur_time = strftime('%H:%M:%S', localtime())
                if host_info:
                    emessage = '%s|%s|%s|%s|%s' % (
                        k, template, host_info['production'], host_info['idc'], host_info['service'])
                    xmpp_mess = '[%s]%s -- 机房:%s, 产品线:%s, 服务:%s 报警时间:%s' % (
                        k, template, host_info['idc'], host_info['production'],
                        host_info['service'], cur_time)
                else:
                    emessage = '%s|%s|%s|%s|%s' % (k, template, '', '', '')
                    xmpp_mess = '[%s]%s -- 报警时间:%s' % (k, template, cur_time)
                # the short (SMS) message shows the host without anything after '?'
                message = '[%s]%s' % (k, template.replace(host, host.split('?')[0]))
                if 'message' in info and len(info['message']) >= 100:
                    # NOTE(review): encode() relies on the default codec; confirm
                    # this is safe for non-ASCII templates.
                    message = message.encode().replace(info['message'], '(output too long,read email)', 1)
                level = str(info['level'])
                readable_time = strftime('%Y-%m-%d %H:%M:%S')
                if self.bm.check(host, name):
                    total_error.append('%s|%s' % (emessage, 'is_black'))
                    continue

                if k == 'ALARM':
                    total_error.append('%s|%s' % (emessage, 'is_not_black'))
                    # make sure the counter for this host/key/level exists
                    if host not in self.alarm_times[item]:
                        self.alarm_times[item][host] = dict()
                    if real_key not in self.alarm_times[item][host]:
                        self.alarm_times[item][host][real_key] = dict()
                    if level not in self.alarm_times[item][host][real_key]:
                        self.alarm_times[item][host][real_key][level] = 0

                    # a limit of 0 means "no limit"
                    if info['limit'] != 0 and \
                            self.alarm_times[item][host][real_key][level] >= info['limit']:
                        print("%s * message:%s, limitation:%s" % (readable_time, message, info['limit']))
                        continue
                elif k == 'RECOVERY':
                    # recovery but delete
                    # ``except Exception`` (not a bare except) so a SystemExit
                    # raised by the signal handler is never swallowed here.
                    try:
                        del self.alarm_info[item][host][real_key]
                    except Exception:
                        pass

                    # only report a recovery when an alarm of this level was counted
                    try:
                        if level in self.alarm_times[item][host][real_key]:
                            del self.alarm_times[item][host][real_key]
                        else:
                            continue
                    except Exception:
                        continue

                mobiles, emails, xmpps = self._collect_recipients(groups)

                print("%s * message:%s, method:%s" % (readable_time, xmpp_mess, info['method']))
                if self.check_alarm_time(info['period']):
                    # if real_key in self.alarm_info means alarm again
                    if k == "ALARM":
                        self.alarm_times[item][host][real_key][level] += 1
                    method = info['method']
                    if 'email' in method:
                        for address in emails:
                            if address not in email_error:
                                email_error[address] = dict()
                            if k not in email_error[address]:
                                email_error[address][k] = dict()
                            email_error[address][k][emessage] = 1
                    if 'sms' in method:
                        for mobile in mobiles:
                            if mobile not in smess_error:
                                smess_error[mobile] = dict()
                            smess_error[mobile][message] = 1
                    if 'xmpp' in method:
                        for jid in xmpps:
                            if jid not in xmpp_error:
                                xmpp_error[jid] = dict()
                            xmpp_error[jid][xmpp_mess] = 1

        # if policy deleted, delete record in total:alarm:errors
        if not total_error:
            self.alertr.hdel('total:alarm:errors', item)
        else:
            self.alertr.hset('total:alarm:errors', item, json.dumps(total_error))
        if email_error:
            try:
                send_email(email_error)
            except Exception as e:
                print(str(e))
        if smess_error:
            try:
                send_smess(smess_error)
            except Exception as e:
                print(str(e))
        if xmpp_error:
            try:
                send_xmpp(self.xmpp, xmpp_error)
            except Exception as e:
                print(str(e))

    def clean_memory(self):
        """Drop stale redis error records and stale timestamps.

        Fields of ``total:alarm:errors`` whose item has no (or empty)
        ``alarm_info`` are deleted, and the check/alarm timestamp tables are
        reduced to the items still present in ``conf_info``.
        """
        for item in self.alertr.hkeys('total:alarm:errors'):
            if item not in self.alarm_info or not self.alarm_info[item]:
                self.alertr.hdel('total:alarm:errors', item)
        # delete timestamps
        for name in ('alarm_time', 'check_time'):
            self.time_stamp[name] = dict(
                (k, v) for k, v in self.time_stamp[name].items() if k in self.conf_info)

    def check_alarm_time(self, period):
        """Return True when the current local hour is inside any period.

        ``period`` is an iterable of ``'<start>-<stop>'`` hour strings.  Both
        bounds are inclusive; entries whose start equals their stop are
        ignored.
        """
        for times in period:
            start, stop = times.split('-')
            start, stop = int(start), int(stop)
            if start == stop:
                continue
            cur_hour = localtime()[3]
            if start <= cur_hour <= stop:
                return True
        return False

    def close_alarm(self, a, b):
        """Signal handler: disconnect the XMPP bot and exit with status 1.

        ``a`` and ``b`` are the signal number and frame passed by ``signal``.
        """
        self.xmpp.disconnect()
        # SystemExit directly: the ``exit`` builtin is only injected by ``site``.
        raise SystemExit(1)

    def run(self):
        """Start the workers and run the main loop until ``alive`` is false."""
        self.init_process()
        # Installed once, and only after the workers have been started, as
        # the original first loop iteration did.
        for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
            signal.signal(sig, self.close_alarm)
        while self.alive:
            # get all agent policy (reloaded at most every 15 seconds)
            cur_time = int(time())
            if 'conf_data' not in self.time_stamp or cur_time - self.time_stamp['conf_data'] >= 15:
                self.conf_info = self.get_conf()
                self.get_data()
                self.time_stamp['conf_data'] = time()
            # put data into check queue.
            check_time = self.time_stamp['check_time']
            for item, conf in self.conf_info.items():
                if item not in check_time or cur_time - check_time[item] >= conf['interval']:
                    self.check_enqueue.put([item, conf, self.alarm_info[item]])
                    check_time[item] = time()
            # read data from check queue.
            while True:
                try:
                    item, data = self.check_dequeue.get(True, 0.1)
                except Queue.Empty:
                    break
                else:
                    self.save_data(item, data)
            # put data into alarm queue.
            alarm_time = self.time_stamp['alarm_time']
            # iterate over a snapshot: entries are deleted inside the loop
            for item in list(self.alarm_info.keys()):
                if item not in self.conf_info:
                    del self.alarm_info[item]
                    continue
                if not self.alarm_info[item]:
                    if item in alarm_time:
                        del alarm_time[item]
                    continue
                if item not in alarm_time or cur_time - alarm_time[item] >= self.conf_info[item]['rate']:
                    self.alarm_enqueue.put([item, self.conf_info[item], self.alarm_info[item]])
                    alarm_time[item] = cur_time
            while True:
                try:
                    item, data = self.alarm_dequeue.get(True, 0.1)
                except Queue.Empty:
                    break
                else:
                    # send alarm data
                    self.send_alarm(item, data)
            # delete recovery errors.
            self.clean_memory()