def nginx_tcp_check_job(count=3): now = datetime.datetime.now() if now.hour == 7: c.redis_instance.delete(c.nginx_tcp_alarm_key) if count <= 0 : c.logger.error(datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S') + ' nginx_tcp_check retry 3 times') sms(mobile_list=c.mobile_list, message_post='nginx_tcp_check retry 3 times') return need_alarm = False msg = '' try: print c.ha_nginx_check_url r = requests.get(c.ha_nginx_check_url, timeout=2) r.raise_for_status() s = html.fromstring(r.text, None, parser=html.HTMLParser(remove_blank_text=True)) ip_list = s.xpath('//table/tr/td[2]') state_list = s.xpath('//table/tr/td[3]') tcp_check_status = [] i = 0 for ip in ip_list: tcp_check_status.append({'ip':ip.text_content(), 'state':state_list[i].text_content()}) i = i + 1 for s in tcp_check_status: msg = '%s|%s:%s' % (msg, s['ip'], s['state']) if s['state'] != 'up': need_alarm = True except Exception, e: now = datetime.datetime.now() c.logger.error('%s %s' % (str(datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')), str(e))) count = count - 1 nginx_tcp_check_job(count);
def rabbitmq_queue_alarm_job(): queue = {'log': None, 'purify': None, 'upload': None, 'download': None, 'encode': None, 'entry': None, 'store': None} try: # lines should be only one row lines = read_file("/tmp/rabbitmq_queue.o") if not lines: c.logger.error("/tmp/rabbitmq_queue.o is blank") info = lines[0] # info = r"encode:|entry:|purify:|download:|log:|upload:|store:" item_list = info.split('|') if not item_list: c.logger.error("queue_info parse error") for item in item_list: tmp = item.split(':') if tmp[1] and tmp[1] != '\n': queue[tmp[0]] = int(tmp[1]) error_q = [] for q in queue.keys(): if q == 'log': if queue[q] is None or queue[q] >= 128: error_q.append("%s:%s" % (q, queue[q])) else: if queue[q] is None or queue[q] >= 32: error_q.append("%s:%s" % (q, queue[q])) if error_q: time = get_date_and_time()[1] content = '|'.join(error_q) msg = '[%s] %s' % (time, content) sms(mobile_list=c.mobile_list, message_post=msg) except Exception, e: c.logger.error(e)
def read_alarm_job(): start_time = datetime.datetime.now() - datetime.timedelta(minutes=36) read_failure_data = AppAvailableData.objects.filter(name='read', result=False, time__gte=start_time).values('time') failure_count = len(read_failure_data) if failure_count >= c.read_alarm_time: try: start_time = read_failure_data[0]['time'] end_time = read_failure_data[failure_count - 1]['time'] type = 'read_bookmark' time = get_date_and_time()[1] msg = 'read failure count:%s,%s' % (str(failure_count), time) sms(mobile_list=c.mobile_list, message_post=msg) # 获取上一次的报警信息 latest = SysAlarm.objects.filter(type=type).order_by('-gmt_create') if latest: latest = latest[0] # start_time是本次报警的开始时间 # latest.end_time是上次报警的结束时间 # 所有在算时间差的时候, start_time应该是时间差的end, latest.end_time应该是时间差的start tdiff = timediff(latest.end_time, start_time) if (not latest) or start_time > latest.end_time and tdiff > 600: # 一次新的故障 alarm = SysAlarm(type=type, start_time=start_time, end_time=end_time) alarm.save() else: # 添加-报警的定时任务是每10min执行一次, 如果本次报警的时间和上次开始的时间<10min, 则认为是同一次故障 # 同一次故障的持续报警, 只需修改上次的结束时间即可 latest.end_time = end_time latest.save() else: # 以前没有过这样类型的故障 alarm = SysAlarm(type=type, start_time=start_time, end_time=end_time) alarm.save() except Exception, e: c.logger.error(e)
for s in tcp_check_status: msg = '%s|%s:%s' % (msg, s['ip'], s['state']) if s['state'] != 'up': need_alarm = True except Exception, e: now = datetime.datetime.now() c.logger.error('%s %s' % (str(datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')), str(e))) count = count - 1 nginx_tcp_check_job(count); else: print 'msg:', msg if need_alarm: s = c.redis_instance.incr(c.nginx_tcp_alarm_key) if s <= c.max_nginx_tcp_alarm: sms(mobile_list=c.mobile_list, message_post=msg) c.logger.error('%s %s' % (str(datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S')), msg)) @print_info(name='web_alarm_job') def web_alarm_job(): kan_url = 'http://kan.sohu.com/' reader_url = 'http://kan.sohu.com/reader/' MAX_ALARM_COUNT = 8 now = datetime.datetime.now() now_str = datetime.datetime.strftime(now, '%Y-%m-%d %H:%M:%S') if int(now.hour) % 2 == 0: c.redis_instance.delete(c.web_alarm_key) #if count <= 0 :