Exemple #1
0
def monitor_main_proc(**conf):
    """Start every monitor process and keep the agent workers' hosts in sync.

    Spawns ``2 * cpu_count`` AgentMonitor processes, deals the currently
    known servers out to them round-robin through each worker's add queue,
    starts one HTTPMonitor and one TCPMonitor, and then loops forever:
    roughly once a minute the host list is reloaded, and every removed or
    added server is pushed to a worker through its delete or add queue.

    Args:
        **conf: configuration mapping. ``conf['database']`` and
            ``conf['totalserver']`` are read here; the whole mapping is
            forwarded to every monitor that gets started.

    This function does not return.
    """
    host_add_queue = dict()
    host_del_queue = dict()
    monitor = dict()
    hosts = dict()
    rangr = MagicShow(**conf['database'])
    current_server = rangr.load(conf['totalserver']).show()
    # number of agent processes: n times the cpu count.
    n = 2
    workers = n * multiprocessing.cpu_count()
    # batch servers and put them into the right queue.
    # AgentMonitor has to maintain many long-lived connections, so its
    # structure differs a bit from the http and tcp monitors.
    for i in range(workers):
        host_add_queue[i] = multiprocessing.Queue()
        host_del_queue[i] = multiprocessing.Queue()
        monitor[i] = AgentMonitor(host_add_queue[i], host_del_queue[i], **conf)
        monitor[i].start()
        # round-robin: worker i owns every server whose position % workers == i.
        hosts[i] = [server for pos, server in enumerate(current_server)
                    if pos % workers == i]
        host_add_queue[i].put(hosts[i])

    # start http monitor
    HTTPMonitor(**conf).start()

    # start tcp monitor
    TCPMonitor(**conf).start()

    while True:
        # read servers from asset2.
        time_stamp = strftime('%Y-%m-%d %H:%M:%S', localtime())
        print('%s start to load host' % time_stamp)
        try:
            # sometimes t.a.wandoulabs.com down!
            load_host(conf['database'])
        except Exception as e:
            # Only ordinary errors are retried; KeyboardInterrupt/SystemExit
            # propagate so the process can actually be stopped.
            print('%s load host failed: %s' % (time_stamp, e))
            sleep(30)
            continue
        time_stamp = strftime('%Y-%m-%d %H:%M:%S', localtime())
        print('%s finish load host' % time_stamp)

        new_server = rangr.load(conf['totalserver']).show()
        if new_server != current_server:
            del_server = set(current_server).difference(new_server)
            add_server = set(new_server).difference(current_server)
            for server in del_server:
                for i in hosts:
                    # put deleted server in del queue of the worker owning it.
                    if server in hosts[i]:
                        print('%s delete server in hosts[%s]' % (time_stamp, i))
                        host_del_queue[i].put([server])
                        hosts[i] = [x for x in hosts[i] if x != server]
            for server in add_server:
                # give the new server to the worker with the fewest servers.
                num = min(hosts, key=lambda idx: len(hosts[idx]))
                print('%s add server in hosts[%s]' % (time_stamp, num))
                host_add_queue[num].put([server])
                hosts[num].append(server)
            current_server = new_server
        sleep(60)
Exemple #2
0
class AlarmMain(object):
    """Main alarm loop.

    Loads policy/collector configuration and collected results from the
    ``alert`` store, hands them to worker processes through multiprocessing
    queues (one pair for checking, one pair for alarming) and sends the
    resulting notifications by email, SMS and XMPP.
    """

    def __init__(self, **conf):
        """Create store handles, bookkeeping dicts, queues and the XMPP bot.

        Args:
            **conf: configuration mapping; ``conf['database']`` is passed
                as keyword arguments to every store/bot handle created here.
        """
        # save yaml conf.
        self.conf = conf
        # redis handler
        self.ranger = MagicShow(**conf['database'])
        self.alertr = RedisClient(db='alert', **conf['database'])
        self.hostr = RedisClient(db='hosts', **conf['database'])
        # blacklist manager.
        self.bm = BlackManager(**conf['database'])
        # latest check result per policy key ('<item>|<conf_type>').
        self.alarm_info = dict()
        # alarm times: policy key -> host -> real_key -> level -> count
        self.alarm_times = dict()
        self.conf_info = dict()
        self.data_info = dict()
        # 'check_time' / 'alarm_time' map policy key -> last enqueue time;
        # 'conf_data' (set in run) is the last configuration reload time.
        self.time_stamp = dict()
        self.time_stamp['check_time'] = dict()
        self.time_stamp['alarm_time'] = dict()
        # which ones are being checked
        self.alarm_check = dict()
        # main loop flag read by run().
        self.alive = True
        # work queues: *_enqueue carries jobs to the workers,
        # *_dequeue carries their results back.
        self.check_enqueue = multiprocessing.Queue()
        self.check_dequeue = multiprocessing.Queue()
        self.alarm_enqueue = multiprocessing.Queue()
        self.alarm_dequeue = multiprocessing.Queue()
        # started worker processes (used as keys).
        self.procs = dict()
        # create xmpp object
        self.xmpp = XMPPBOT(address=("125.39.223.167",5222), **conf['database'])
        self.xmpp.start()
        self.xmpp.enter_room()

    def get_conf(self):
        """Build the policy table for the 'client', 'tcp' and 'http' types.

        Each policy is merged with its collector's ``interval`` and ``type``
        (plus ``port`` for tcp) and its ``target`` expression is expanded
        through ``self.ranger``. Policies that cannot be parsed, that name
        an unknown collector, or that lack a required key are skipped so
        that one bad entry cannot stop the main loop.

        Returns:
            dict mapping ``'<item>|<conf_type>'`` to the merged policy dict.
        """
        conf_info = dict()
        for conf_type in ['client', 'tcp', 'http']:
            policy_configs = self.alertr.hgetall('total:%s:policy:configs' % conf_type)
            collector_configs = self.alertr.hgetall('total:%s:collector:configs' % conf_type)
            for item, raw_policy in policy_configs.items():
                try:
                    policy_conf = json.loads(raw_policy)
                    collector_conf = json.loads(collector_configs[policy_conf['collector']])
                    target = policy_conf.pop('target')
                    policy_conf['interval'] = collector_conf['interval']
                    policy_conf['type'] = collector_conf['type']
                    if conf_type == 'tcp':
                        policy_conf['port'] = collector_conf['port']
                except (AttributeError, KeyError, TypeError, ValueError):
                    # unparsable JSON, unknown collector or missing key.
                    continue

                policy_conf['target'] = self.ranger.load(target).show()
                uniq_key = '%s|%s' % (item, conf_type)
                conf_info[uniq_key] = policy_conf
                # make sure the bookkeeping dicts know this policy.
                self.alarm_info.setdefault(uniq_key, dict())
                self.alarm_times.setdefault(uniq_key, dict())
        return conf_info

    def init_process(self):
        """Start ``2 * cpu_count`` check workers, then as many alarm workers."""
        workers = multiprocessing.cpu_count() * 2
        jobs = (
            (check_data_proc, self.check_enqueue, self.check_dequeue),
            (alarm_data_proc, self.alarm_enqueue, self.alarm_dequeue),
        )
        for target, in_queue, out_queue in jobs:
            for _ in range(workers):
                proc = Process(target=target, args=(in_queue, out_queue))
                self.procs[proc] = 1
                proc.start()

    def get_data(self):
        """Attach the stored result hash of each policy's collector.

        For every policy in ``self.conf_info`` whose ``result:<collector>``
        key exists, the hash is stored under the policy's ``'result'`` key.
        """
        for item, conf in self.conf_info.items():
            result_key = "result:%s" % conf['collector']
            if not self.alertr.exists(result_key):
                continue
            # too slow if determine, just leave to process.
            self.conf_info[item]['result'] = self.alertr.hgetall(result_key)

    def save_data(self, item, data):
        """Remember the latest check result *data* for policy *item*."""
        self.alarm_info[item] = data

    @staticmethod
    def _render_template(template, info):
        """Expand ``$key$`` placeholders of *template* and drop newlines.

        A placeholder whose key is in *info* is replaced by ``str(info[key])``.
        A placeholder with an unknown key is removed together with one
        directly adjacent comma on either side.
        """
        placeholder = re.compile(r'\$(\w+)\$', re.I)
        while True:
            found = placeholder.search(template)
            if not found:
                break
            key = found.group(1)
            token = '$' + key + '$'
            if key in info:
                template = template.replace(token, str(info[key]), 1)
            else:
                # '$' must be escaped: unescaped it is the end-of-string
                # anchor, the placeholder would never be removed and this
                # loop would never terminate.
                template = re.sub(',?' + re.escape(token) + ',?', '', template, count=1)
        return template.replace("\n", "")

    def send_alarm(self, item, data):
        """Turn the alarm worker's output for one policy into notifications.

        Args:
            item: policy key (``'<item>|<conf_type>'``).
            data: dict mapping a state label to a list of info dicts. The
                labels 'ALARM' and 'RECOVERY' get special handling. Each
                info dict is read for ``template``, ``real_key``, ``host``,
                ``name``, ``group``, ``level``, ``limit``, ``method`` and
                ``period`` (and optionally ``message``).

        Blacklisted and ALARM entries are recorded under
        ``total:alarm:errors``. Messages are grouped per recipient and
        handed to ``send_email`` / ``send_smess`` / ``send_xmpp``; failures
        of those senders are printed, not raised.
        """
        email_error = dict()
        smess_error = dict()
        xmpp_error = dict()
        total_error = list()
        for k, v in data.items():
            for info in v:
                template = self._render_template(info['template'], info)
                real_key, host, name, groups = info['real_key'], info['host'], info['name'], info['group']
                host_info = self.hostr.hgetall(host)
                cur_time = strftime('%H:%M:%S', localtime())
                if host_info:
                    emessage = '%s|%s|%s|%s|%s' % (k, template, host_info['production'],
                                                   host_info['idc'], host_info['service'])
                    xmpp_mess = '[%s]%s -- 机房:%s, 产品线:%s, 服务:%s 报警时间:%s' % (
                        k, template, host_info['idc'], host_info['production'],
                        host_info['service'], cur_time)
                else:
                    emessage = '%s|%s|%s|%s|%s' % (k, template, '', '', '')
                    xmpp_mess = '[%s]%s -- 报警时间:%s' % (k, template, cur_time)
                # SMS text: cut the host at the first '?'.
                message = '[%s]%s' % (k, template.replace(host, host.split('?')[0]))
                if 'message' in info and len(info['message']) >= 100:
                    # NOTE(review): .encode() is kept from the original
                    # Python 2 code; confirm it is safe for non-ASCII text.
                    message = message.encode().replace(info['message'], '(output too long,read email)', 1)
                level = str(info['level'])
                readable_time = strftime('%Y-%m-%d %H:%M:%S')
                if self.bm.check(host, name):
                    total_error.append('%s|%s' % (emessage, 'is_black'))
                    continue

                if k == 'ALARM':
                    total_error.append('%s|%s' % (emessage, 'is_not_black'))
                    # make sure a counter exists for this host/key/level.
                    levels = self.alarm_times[item].setdefault(host, dict()).setdefault(real_key, dict())
                    levels.setdefault(level, 0)
                    # limit 0 means unlimited.
                    if info['limit'] != 0 and levels[level] >= info['limit']:
                        print("%s * message:%s, limitation:%s" % (readable_time, message, info['limit']))
                        continue
                elif k == 'RECOVERY':
                    # recovery: forget the stored alarm, if there is one.
                    try:
                        del self.alarm_info[item][host][real_key]
                    except (KeyError, TypeError):
                        pass

                    # only notify a recovery when an alarm of this level was
                    # counted before; the counters are reset either way.
                    try:
                        if level in self.alarm_times[item][host][real_key]:
                            del self.alarm_times[item][host][real_key]
                        else:
                            continue
                    except (KeyError, TypeError):
                        continue

                mobiles = list()
                emails = list()
                xmpps = list()

                for group in groups:
                    group_info = self.alertr.get('group:' + group)
                    if not group_info:
                        # a missing group must not hide the groups after it.
                        continue
                    try:
                        group_info = json.loads(group_info)
                    except Exception as e:
                        print("Error Exception:%s, Group:%s" % (e, group))
                        continue
                    # collect recipients without duplicates, keeping order.
                    for m in group_info['mobile'] if 'mobile' in group_info else []:
                        if m not in mobiles:
                            mobiles.append(m)
                    for addr in group_info['email'] if 'email' in group_info else []:
                        if addr not in emails:
                            emails.append(addr)
                    for x in group_info['xmpp'] if 'xmpp' in group_info else []:
                        if x not in xmpps:
                            xmpps.append(x)

                print("%s * message:%s, method:%s" % (readable_time, xmpp_mess, info['method']))
                if self.check_alarm_time(info['period']):
                    # if real_key in self.alarm_info means alarm again
                    if k == "ALARM":
                        self.alarm_times[item][host][real_key][level] += 1
                    method = info['method']
                    if 'email' in method:
                        for addr in emails:
                            email_error.setdefault(addr, dict()).setdefault(k, dict())[emessage] = 1
                    if 'sms' in method:
                        for m in mobiles:
                            smess_error.setdefault(m, dict())[message] = 1
                    if 'xmpp' in method:
                        for x in xmpps:
                            xmpp_error.setdefault(x, dict())[xmpp_mess] = 1

        #if policy deleted, delete record in total:alarm:errors
        if not total_error:
            self.alertr.hdel('total:alarm:errors', item)
        else:
            self.alertr.hset('total:alarm:errors', item, json.dumps(total_error))
        # sending is best effort: report a failure and go on.
        if email_error:
            try:
                send_email(email_error)
            except Exception as e:
                print(str(e))
        if smess_error:
            try:
                send_smess(smess_error)
            except Exception as e:
                print(str(e))
        if xmpp_error:
            try:
                send_xmpp(self.xmpp, xmpp_error)
            except Exception as e:
                print(str(e))

    def clean_memory(self):
        """Drop stale error records and timestamps of vanished policies."""
        for item in self.alertr.hkeys('total:alarm:errors'):
            if item not in self.alarm_info or not self.alarm_info[item]:
                self.alertr.hdel('total:alarm:errors', item)
        # delete timestamps of policies that are no longer configured.
        for name in ('alarm_time', 'check_time'):
            self.time_stamp[name] = dict(
                (k, v) for k, v in self.time_stamp[name].items() if k in self.conf_info)

    def check_alarm_time(self, period):
        """Return True if the current local hour lies in one of *period*.

        Args:
            period: iterable of ``'<start>-<stop>'`` hour strings. A window
                whose start equals its stop is ignored.
        """
        # NOTE(review): a window with start > stop (e.g. '22-6') never
        # matches - confirm whether windows across midnight are expected.
        cur_hour = localtime()[3]
        for times in period:
            start, stop = times.split('-')
            start, stop = int(start), int(stop)
            if start == stop:
                continue
            if start <= cur_hour <= stop:
                return True
        return False

    def close_alarm(self, a, b):
        """Signal handler: disconnect the XMPP bot and exit with status 1."""
        self.xmpp.disconnect()
        # same effect as exit(1) without relying on the site module.
        raise SystemExit(1)

    def run(self):
        """Start the workers and run the check/alarm loop while ``self.alive``."""
        self.init_process()
        # install the handlers once, after the workers have been started.
        for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
            signal.signal(sig, self.close_alarm)
        while self.alive:
            # get all agent policy (reloaded at most every 15 seconds).
            cur_time = int(time())
            if 'conf_data' not in self.time_stamp or cur_time - self.time_stamp['conf_data'] >= 15:
                self.conf_info = self.get_conf()
                self.get_data()
                self.time_stamp['conf_data'] = time()
            # put data into check queue.
            check_time = self.time_stamp['check_time']
            for item, conf in self.conf_info.items():
                if item not in check_time or cur_time - check_time[item] >= conf['interval']:
                    self.check_enqueue.put([item, conf, self.alarm_info[item]])
                    check_time[item] = time()
            # read data from check queue.
            while True:
                try:
                    item, data = self.check_dequeue.get(True, 0.1)
                except Queue.Empty:
                    break
                else:
                    self.save_data(item, data)
            # put data into alarm queue.
            alarm_time = self.time_stamp['alarm_time']
            # iterate over a copy: entries are deleted inside the loop.
            for item in list(self.alarm_info):
                if item not in self.conf_info:
                    del self.alarm_info[item]
                    continue
                if not self.alarm_info[item]:
                    alarm_time.pop(item, None)
                    continue
                if item not in alarm_time or cur_time - alarm_time[item] >= self.conf_info[item]['rate']:
                    self.alarm_enqueue.put([item, self.conf_info[item], self.alarm_info[item]])
                    alarm_time[item] = cur_time
            while True:
                try:
                    item, data = self.alarm_dequeue.get(True, 0.1)
                except Queue.Empty:
                    break
                else:
                    # send alarm data
                    self.send_alarm(item, data)
            # delete recovery errors.
            self.clean_memory()
Exemple #3
0
class AgentMonitor(multiprocessing.Process):
    """Process that keeps one TCP connection per assigned host (port 1021).

    Hosts are handed over by the parent through ``add_queue`` and taken
    away through ``del_queue``; sockets are registered with an epoll object.
    """

    def __init__(self, add_queue, del_queue, **conf):
        """Set up store handles and the per-connection bookkeeping.

        Args:
            add_queue: queue on which new servers for this process arrive.
            del_queue: queue on which deleted servers arrive.
            **conf: configuration mapping; ``conf['database']`` is passed
                as keyword arguments to the store handles.
        """
        multiprocessing.Process.__init__(self)
        self.conf = conf
        self.ranger = MagicShow(**self.conf['database'])
        self.alertr = RedisClient(db='alert', **self.conf['database'])
        # save all servers of this process.
        self.total_server = list()
        # queue for add new servers.
        self.add_queue = add_queue
        # queue for del deleted servers.
        self.del_queue = del_queue
        # vars for loop.
        self.alive = True
        # vars for saving client data.
        self.data_info = dict()
        # host -> connected socket.
        self.connections = dict()
        # socket file descriptor -> host.
        self.filenos = dict()
        # save some timestamp.
        self.timestamps = dict()
        # host -> data received so far.
        self.requests = dict()
        # epoll object.
        self.epoll = select.epoll()
        # save each host's configs.
        self.host_configs = dict()

    def init_connection(self):
        """Connect to every server without a connection and drop stale ones.

        The outcome of each connection attempt is recorded as a JSON string
        in ``self.data_info['client_status'][host]``. Connections to hosts
        that are no longer in ``self.total_server`` are recycled and their
        status entry is removed.
        """
        if 'client_status' not in self.data_info:
            self.data_info['client_status'] = dict()
        status = self.data_info['client_status']
        sock_timeout = 0.01
        for host in self.total_server:
            # hosts that already have a connection are left alone.
            if host in self.connections:
                continue
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            # SO_LINGER with l_onoff=1, l_linger=0.
            optval = struct.pack('ii', 1, 0)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, optval)
            sock.settimeout(sock_timeout)
            try:
                sock.connect((host, 1021))
            except socket.error:
                # release the descriptor: this path is retried for every
                # unreachable host and would otherwise leak one socket each.
                sock.close()
                status[host] = json.dumps({'data' : 'Connection Failed', \
                                           'time' : int(time())})
            else:
                # connected: switch to non-blocking mode for epoll.
                sock.settimeout(None)
                sock.setblocking(0)
                status[host] = json.dumps({'data' : 'Connection Success', \
                                           'time' : int(time())})
                fileno = sock.fileno()
                self.connections[host] = sock
                self.filenos[fileno] = host
                self.requests[host] = ''
                self.epoll.register(fileno, select.EPOLLIN)
        for host in set(self.connections) - set(self.total_server):
            self.recycle_connection(self.connections[host])
            # delete information of deleted servers; this has to happen
            # after recycle_connection, which writes a failure status.
            status.pop(host, None)

    def send_data(self, sock, data):
        """Send *data* on *sock*, prefixed by its length as 10 decimal digits.

        The connection is recycled if sending fails.
        """
        data = "%010d%s" % (len(data), data)
        try:
            sock.sendall(data)
        except socket.error:
            self.recycle_connection(sock)

    def list_servers(self, regex):
        """Return the servers ``self.ranger`` resolves for expression *regex*."""
        return self.ranger.load(regex).show()

    def recycle_connection(self, sock):
        """Close *sock* and forget it; its host is marked 'Connection Failed'.

        *sock* must still be open and known to ``self.filenos``.
        """
        fileno = sock.fileno()
        host = self.filenos[fileno]
        # record failed clients.
        self.data_info['client_status'][host] = json.dumps({'data' : 'Connection Failed',\
            'time' : int(time())})
        # cancel epoll event.
        self.epoll.unregister(fileno)
        # close socket
        self.connections[host].close()
        # delete records.
        del self.connections[host]
        del self.requests[host]
        del self.filenos[fileno]

    def get_config(self):
        """Build the NEWCONFIG message for every host that has client configs.

        Reads all entries of ``total:client:collector:configs``, expands
        each entry's ``target`` into hosts and groups the remaining config
        per host. Entries that are not valid JSON or that have no
        ``target`` are reported and skipped.

        Returns:
            dict mapping host to a JSON string
            ``{"action": "NEWCONFIG", "data": [config, ...]}``; empty when
            no host has a config.
        """
        # get all configs about client
        configs = self.alertr.hgetall('total:client:collector:configs')
        find_host = dict()
        data = dict()
        for item, conf in configs.items():
            try:
                conf = json.loads(conf)
                target = conf.pop('target')
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                print('skip collector config %s: %s' % (item, e))
                continue
            # cache expr which already select in redis.
            if target not in find_host:
                find_host[target] = self.list_servers(target)

            for host in find_host[target]:
                data.setdefault(host, list()).append(conf)
        return dict((k, json.dumps({'action' : 'NEWCONFIG', 'data' : v})) for k, v in data.items())