for item in [{ 'name': u'loss_rate', 'comment': u'网络丢包率'.encode('utf-8'), 'unit': '%' }, { 'name': u'delay', 'comment': u'网络延迟'.encode('utf-8'), 'unit': 'ms' }]: session.add(MonitorItem(**item)) # session.add(MonitorRegion(name='beijing-1-qu', _serverips=[MonitorServerIp(ip='10.40.44.2')])) # session.add(MonitorRegion(name='beijing-2-qu', _serverips=[ # MonitorServerIp(ip='120.1.12.3'), # # MonitorServerIp(ip='127.0.0.3'), # ])) session.commit() # item_ids = session.query(MonitorItem).all() # triggers_list = [ # { # "period": "1min", # "time": "5c", # "repeat": "1h", # "compare": ">", # "threshold": 90, # "item_id": item_ids[0].id # }, # { # "period": "5min", # "time": "2c", # "repeat": "2h",
class MonitorAlertHandler(RequestHandler):
    """Request handler for network-quality monitoring records and alerts.

    ``get`` returns the Elasticsearch records of one region, each annotated
    with a severity ``level``.  ``post`` stores one incoming sample, evaluates
    every trigger bound to the reporting server's region and sends alert or
    recovery mails, de-duplicating through flags kept in Redis.
    """

    # Format used to parse the ``time`` field of samples and of Redis flags.
    TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Inclusive upper bounds of severity levels 0..3; anything above the last
    # bound (or below zero) is level 4.
    LOSS_RATE_BOUNDS = (0.5, 1.0, 2.0, 5.0)
    DELAY_BOUNDS = (50, 100, 150, 200)

    # Fields of an incoming sample that are kept by ``request_data_format``.
    SAMPLE_KEYS = (
        'province', 'country', 'county', 'server', 'delay', 'loss_rate',
        'time', 'broadband', 'counter', 'app_id', 'max_delay', 'min_delay',
        'type', 'devices',
    )

    def initialize(self):
        """Create the per-request database session and Redis cache client."""
        self.db_session = Session()
        self.redis = RedisCache()

    def on_finish(self):
        """Close the database session opened in ``initialize``."""
        self.db_session.close()

    @except_handler()
    def get(self, req_data):
        """Return the records of one region, each tagged with a ``level``.

        ``req_data`` is validated through ``MonitorAlertDashGetForm``; an
        invalid form raises ``Status400`` and an unknown region id raises
        ``Status403``.  The remaining form fields are forwarded as keyword
        arguments to ``es_client.get_region_record``.
        """
        form = MonitorAlertDashGetForm(ImmutableMultiDict(req_data))
        if not form.validate():
            raise Status400(form.errors)

        form_data = form.data.copy()
        region = form_data.pop('region')
        ser_instance = self.db_session.query(MonitorRegion).get(region)
        if not ser_instance:
            raise Status403('region id is invalid')

        server_list = [i.ip for i in ser_instance._serverips]
        form_data['broadband'] = form_data['broadband'].split(',')
        origin_data = es_client.get_region_record(
            settings.opt['ELASTICSEARCH']['INDEX'],
            servers=server_list,
            **form_data)

        datatype = form_data['datatype']
        for item in origin_data:
            # Only read the metric(s) the requested datatype needs; a record
            # is not required to carry the other one.
            if datatype == 'delay':
                item['level'] = MonitorAlertHandler.set_level(
                    datatype, delay=item['delay'])
            elif datatype == 'loss_rate':
                item['level'] = MonitorAlertHandler.set_level(
                    datatype, loss_rate=item['loss_rate'])
            else:
                item['level'] = MonitorAlertHandler.set_level(
                    datatype, item['delay'], item['loss_rate'])
        return {'return': 0, 'message': 'success', 'instance': origin_data}

    @staticmethod
    def _grade(value, bounds):
        """Map ``value`` to a level: index of the first bound it does not exceed."""
        if value < 0:
            # Negative measurements are graded as the worst level, as before.
            return len(bounds)
        for level, upper in enumerate(bounds):
            if value <= upper:
                return level
        return len(bounds)

    @staticmethod
    def set_level(datatype, delay=0, loss_rate=0):
        """Return the severity level (0 best .. 4 worst) for a measurement.

        ``datatype`` selects what is graded: ``'loss_rate'``, ``'delay'`` or
        ``'all'`` (the worse of the two).  Any other datatype yields 0.

        The loss-rate bands are contiguous (<=0.5, <=1.0, <=2.0, <=5.0, above);
        previously values such as 0.55 fell into a gap between bands and were
        graded as level 4.
        """
        loss_level, delay_level = 0, 0
        if datatype == 'loss_rate' or datatype == 'all':
            loss_level = MonitorAlertHandler._grade(
                loss_rate, MonitorAlertHandler.LOSS_RATE_BOUNDS)
        if datatype == 'delay' or datatype == 'all':
            delay_level = MonitorAlertHandler._grade(
                delay, MonitorAlertHandler.DELAY_BOUNDS)

        if datatype == 'loss_rate':
            return loss_level
        if datatype == 'delay':
            return delay_level
        return max(delay_level, loss_level)

    def alert_handler(self, req_data):
        """Evaluate every trigger that applies to the sample's server.

        ``req_data`` is one sample as produced by ``request_data_format``
        (keys read here: ``server``, ``time``, ``province``, ``county``,
        ``broadband``, ``app_id``).

        An alert is identified jointly by a trigger and a server ip: one
        result entry corresponds to one (trigger, server ip) pair.

        Returns a list of dicts with keys ``alert`` (bool), ``trigger``,
        ``region`` and ``data``; the list is empty when the server ip or its
        region is unknown.
        """
        alert_result = []
        serverip = self.db_session.query(MonitorServerIp).filter(
            MonitorServerIp.ip == req_data['server']).first()
        if not serverip:
            return alert_result

        region = self.db_session.query(MonitorRegion).filter(
            MonitorRegion._serverips.contains(serverip)).first()
        if not region:
            return alert_result

        rules = self.db_session.query(MonitorRule).filter(
            MonitorRule._regions.contains(region)).all()
        for rule in rules:
            for r_trigger in rule._triggers:
                # Fetch history from ES for this trigger: the window covers
                # ``time_int`` consecutive periods ending at the sample time.
                time_int = TimeInt[r_trigger.time]
                period = PeriodInt[r_trigger.period]
                stoptime = datetime.datetime.strptime(
                    req_data['time'], self.TIME_FORMAT)
                starttime = stoptime + datetime.timedelta(
                    minutes=-(period * time_int))
                history_data = es_client.get_record(
                    settings.opt['ELASTICSEARCH']['INDEX'], starttime,
                    stoptime, req_data['province'], req_data['county'],
                    req_data['broadband'], req_data['server'],
                    req_data['app_id'], period, r_trigger._item.name)
                logger.debug('{0}'.format(r_trigger._item.name))
                logger.debug('{0}'.format(history_data))
                logger.debug('{0}'.format(r_trigger.time))

                alert_ret = MonitorAlertHandler.alert_check(
                    history_data, time_int, r_trigger.threshold,
                    r_trigger.compare)
                alert_result.append({
                    'alert': alert_ret is True,
                    'trigger': r_trigger,
                    # The region the server belongs to.  This used to hold the
                    # rule object, so mails and alert logs showed the rule
                    # name where the region name was intended.
                    'region': region,
                    # NOTE(review): the first element is dropped here although
                    # alert_check evaluates it -- confirm this is intended.
                    'data': history_data[1:],
                })
        return alert_result

    @staticmethod
    def alert_check(data_list, last_time, threshold, compare):
        """Return True when the first ``last_time`` values all breach the threshold.

        ``compare`` is ``'>'`` or ``'<'``; any other value means equality.
        Returns False when fewer than ``last_time`` values are available or
        when the window is empty (``last_time <= 0``), which previously raised
        ``UnboundLocalError``.
        """
        if last_time <= 0 or last_time > len(data_list):
            return False
        window = data_list[:last_time]
        if compare == '>':
            return all(value > threshold for value in window)
        if compare == '<':
            return all(value < threshold for value in window)
        return all(value == threshold for value in window)

    @staticmethod
    def _flag(prefix, trig_id, ip, province, county, broadband, appid):
        """Build the Redis key identifying one alert ('trig') or its confirmation ('cfm')."""
        return '{0}:{1};{2};{3};{4};{5};{6}'.format(
            prefix, trig_id, ip, province, county, broadband, appid)

    def check_confirm(self, trig_id, ip, province, county, broadband, appid):
        """Return True when a confirmation flag exists in Redis for this alert.

        ``trig_id`` is the MonitorTrigger id.
        """
        flag = self._flag('cfm', trig_id, ip, province, county, broadband,
                          appid)
        return bool(self.redis.get_data(flag))

    def check_alert(self, trig, ip, province, county, broadband, appid,
                    time_now):
        """Classify an alerting sample against the trigger's repeat interval.

        ``trig`` is a MonitorTrigger instance.  Returns:

        * ``'no_alert'``   -- no previous alert was recorded; the flag is
          created with ``time_now``.
        * ``'in_repeat'``  -- the previous alert is still inside the repeat
          interval.
        * ``'out_repeat'`` -- the repeat interval has elapsed (the flag is
          refreshed), or the trigger is configured as ``'no_repeat'``.
        """
        if trig.repeat == 'no_repeat':
            return 'out_repeat'

        flag = self._flag('trig', trig.id, ip, province, county, broadband,
                          appid)
        time_pre = self.redis.get_data(flag)
        if not time_pre:
            self.redis.set_data(flag, time_now)
            return 'no_alert'

        parse = datetime.datetime.strptime
        elapsed = (parse(time_now, self.TIME_FORMAT) -
                   parse(time_pre, self.TIME_FORMAT))
        # total_seconds() rather than .seconds: the latter discards whole days
        # and wraps around for negative differences.
        if elapsed.total_seconds() > RepeatInt[trig.repeat] * 60:
            self.redis.set_data(flag, time_now)
            return 'out_repeat'
        return 'in_repeat'

    def cancel_alert(self, trig_id, ip, province, county, broadband, appid):
        """Clear the alert state; return True if an alert was active before.

        The result relies on ``redis.del_data`` returning a truthy value when
        the alert flag existed.  The confirmation flag is removed as well.
        """
        flag = self._flag('trig', trig_id, ip, province, county, broadband,
                          appid)
        if not self.redis.del_data(flag):
            return False
        # Also drop the confirmation info that belonged to the alert.
        self.redis.del_data(self._flag('cfm', trig_id, ip, province, county,
                                       broadband, appid))
        return True

    @staticmethod
    def request_data_format(req_data):
        """Keep only the known sample fields and normalise ``broadband``.

        A carrier that is missing or not listed in ``CarrierList`` is replaced
        by the catch-all value.
        """
        ret_data = {
            key: req_data[key]
            for key in MonitorAlertHandler.SAMPLE_KEYS
            if key in req_data
        }
        if ret_data.get('broadband') not in CarrierList:
            # NOTE(review): this stores an already-encoded byte string while
            # post() later calls .encode('utf8') on the same value -- confirm
            # the interpreter's default-encoding setup makes that safe.
            ret_data['broadband'] = u'其他'.encode('utf-8')
        return ret_data

    @staticmethod
    def _format_values(values):
        """Render measured values with two decimals for mails and logs."""
        return ['{:.2f}'.format(value) for value in values]

    def _alert_identity(self, trig_l, req_data):
        """Return the fields that jointly identify one alert.

        An alert is identified by trigger id, server ip, province, county,
        carrier and app_id.
        """
        return (trig_l['trigger'].id, req_data['server'],
                req_data['province'], req_data['county'],
                req_data['broadband'], req_data['app_id'])

    def _mail_args(self, trig_l, req_data):
        """Return the 14 positional arguments shared by both mail templates."""
        trigger = trig_l['trigger']
        return (
            req_data['time'],
            trig_l['region'].name,
            req_data['server'],
            u'{0}, {1}'.format(req_data['province'], req_data['county']),
            req_data['broadband'],
            u'{0}'.format(trigger._item.comment),
            trigger._rule.name,  # {6}
            trigger._item.comment,
            trigger.compare,
            trigger.threshold,
            trigger._item.unit,  # {10}
            trigger.period,
            trigger.time,  # {12}
            self._format_values(trig_l['data']),
        )

    def _send_to_contacts(self, trigger, message):
        """Mail ``message`` to every action configured on the trigger's rule."""
        for action in trigger._rule._actions:
            send_mail(action.mail, action.contacter, message)

    def _create_alert_log(self, trig_l, req_data):
        """Persist a new ``MonitorAlertLog`` row in state ``'open'``."""
        trigger = trig_l['trigger']
        logger.debug('create_alert_log:{0}, {1}, {2}'.format(
            trigger.id, req_data['server'], req_data['time']))
        content_detail = (
            u'[{}]的线路出现告警,问题为:[{}异常]<br/>'
            u'告警策略详情:<br/>'
            u'指标:{},触发条件:{}{}{},统计周期:{},持续时间:{}<br/>'
            u'告警数值:<br/>'
            u'{}({})'
        ).format(
            req_data['broadband'],
            trigger._item.comment,
            trigger._item.comment,
            trigger.compare,
            trigger.threshold,
            trigger._item.unit,
            trigger.period,
            trigger.time,
            self._format_values(trig_l['data']),
            trigger._item.unit,
        )
        instance_ma = MonitorAlertLog(
            start_time=req_data['time'],
            trigger='{0}'.format(trigger.id),
            rule_name=trigger._rule.name,
            region=trig_l['region'].name,
            province=req_data['province'].encode('utf8'),
            county=req_data['county'].encode('utf8'),
            broadband=req_data['broadband'].encode('utf8'),
            appid=req_data['app_id'],
            serverip=req_data['server'],
            state='open',
            content=trigger._item.comment.encode('utf8'),
            content_detail=content_detail.encode('utf8'),
        )
        instance_ma._save(self.db_session)

    def _handle_alert(self, trig_l, req_data):
        """Log and mail one alerting trigger result.

        Nothing is sent while the previous alert is still inside the repeat
        interval or once the alert has been confirmed.  A log entry is
        created only when no alert was active before.
        """
        trigger = trig_l['trigger']
        ret_alert = self.check_alert(
            trigger, req_data['server'], req_data['province'],
            req_data['county'], req_data['broadband'], req_data['app_id'],
            req_data['time'])
        if ret_alert == 'in_repeat':
            return
        if ret_alert == 'no_alert':
            # No alert before: record this one in the alert log.
            self._create_alert_log(trig_l, req_data)

        if self.check_confirm(*self._alert_identity(trig_l, req_data)):
            return

        message = (
            u'[{0}],您的[{1}]集群下IP为[{2}]的服务器连接[{3}]地区[{4}]的线路出现告警,问题为:\r\n[{5}异常]\r\n'
            u'告警策略详情:\r\n'
            u'名称:{6}, 指标:{7},触发条件:{8}{9}{10},统计周期:{11},持续时间:{12}\r\n'
            u'当前数值:\r\n'
            u'{13}({10})\r\n'
            u'请及时处理,更多详细信息请至控制台查看。'
        ).format(*self._mail_args(trig_l, req_data))
        self._send_to_contacts(trigger, message)

    def _handle_recovery(self, trig_l, req_data):
        """Close open alert logs and mail a recovery notice, if an alert was active."""
        trigger = trig_l['trigger']
        if not self.cancel_alert(*self._alert_identity(trig_l, req_data)):
            return

        # Update the matching alert logs with the recovery information.
        al_records = self.db_session.query(MonitorAlertLog).filter(
            MonitorAlertLog.rule_name == trigger._rule.name,
            MonitorAlertLog.trigger == '{0}'.format(trigger.id),
            MonitorAlertLog.serverip == req_data['server'],
            MonitorAlertLog.province == req_data['province'].encode('utf8'),
            MonitorAlertLog.county == req_data['county'].encode('utf8'),
            MonitorAlertLog.broadband == req_data['broadband'].encode('utf8'),
            MonitorAlertLog.appid == req_data['app_id'],
            or_(MonitorAlertLog.state == 'open',
                MonitorAlertLog.state == 'confirm'),
        ).all()
        for al_record in al_records:
            al_record.state = 'close'
            al_record.stop_time = req_data['time']
            al_record.content_detail = (
                u'{}<br/>'
                u'线路恢复正常,告警取消。<br/>'
                u'当前数值:<br/>'
                u'{}({})'
            ).format(
                al_record.content_detail,
                self._format_values(trig_l['data']),
                trigger._item.unit,
            ).encode('utf8')
            al_record._save(self.db_session, update=True, commit=False)
        self.db_session.commit()

        # Send the "alert cancelled" message.
        message = (
            u'[{0}],您的[{1}]集群下IP为[{2}]的服务器连接[{3}]地区[{4}]的线路恢复正常,原告警问题描述为:\r\n[{5}异常]\r\n'
            u'触发告警策略详情:\r\n'
            u'名称:{6}, 指标:{7},触发条件:{8}{9}{10},统计周期:{11},持续时间:{12}\r\n'
            u'当前数值:\r\n'
            u'{13}({10})\r\n'
            u'更多详细信息请至控制台查看。'
        ).format(*self._mail_args(trig_l, req_data))
        self._send_to_contacts(trigger, message)

    @except_handler()
    def post(self, request_data):
        """Store one sample, evaluate its triggers and send alert/recovery mails.

        Returns ``{'return': 0}``.
        """
        req_data = MonitorAlertHandler.request_data_format(request_data)
        es_client.create_record(settings.opt['ELASTICSEARCH']['INDEX'],
                                req_data)
        trigger_alert = self.alert_handler(req_data)
        for _trig in trigger_alert:
            logger.debug('{0}'.format(_trig))

        for trig_l in trigger_alert:
            if trig_l['alert']:
                self._handle_alert(trig_l, req_data)
            else:
                # Check whether a previously raised alert must be cancelled.
                self._handle_recovery(trig_l, req_data)
        return {'return': 0}