def __monitor(domain):
    """Check the DIP scheduler's waiting queue on *domain* and alert the
    DIP_ALL group when it backs up to __THRESHOLD__ applications or more.

    :param domain: scheduler host, passed straight through to http.get.
    """
    response = http.get(domain=domain, url='/rest/monitor/waitingQueue')
    # Only act on a successful response that actually carries queue entries.
    if response['code'] == '200' and response['data']['selectjob']:
        waiting_apps = len(response['data']['selectjob'])
        if waiting_apps >= __THRESHOLD__:
            detail = '%s: %d' % (domain, waiting_apps)
            # logging.warn is a deprecated alias of logging.warning; also
            # fixed the "watting" typo in the log text.
            logging.warning('%s waiting queue serious: %d' % (domain, waiting_apps))
            # NOTE(review): the "WaittingQueue" subject typo is kept as-is --
            # downstream alert routing may match on the exact subject string.
            watchalert.sendAlertToGroups("Portal-DIPScheduler",
                                         "WaittingQueue Serious",
                                         detail, detail,
                                         "DIP_ALL", True, True, False)
def __monitor(domain):
    """Check the DIP scheduler's running list on *domain* and alert for every
    application whose execution time exceeds __TIMEOUT__ seconds.

    :param domain: scheduler host, passed straight through to http.get.
    """
    now = datetime.now()
    response = http.get(domain=domain, url='/rest/monitor/runningList')
    if response['code'] == '200' and response['data']['selectjob']:
        running_apps = response['data']['selectjob']
        for running_app in running_apps:
            running_app_name = running_app['jobName']
            # executeTime arrives formatted like "Jan 02, 2018 03:04:05 PM".
            running_app_execute_time = datetime.strptime(
                running_app['executeTime'], '%b %d, %Y %I:%M:%S %p')
            running_app_time = (now - running_app_execute_time).total_seconds()
            if running_app_time > __TIMEOUT__:
                # Single-argument print(...) behaves identically on Python 2
                # and 3; the original "print x" statement is a SyntaxError
                # under Python 3.
                print("Application {} timeout".format(running_app_name))
                watchalert.sendAlertToGroups(
                    "Portal-DIPScheduler", "RunningList Serious",
                    "Application {} timeout".format(running_app_name), "",
                    "DIP_ALL", True, True, False)
def report_to_group(subject, content):
    """Raise a Storm supervisor-survival alert to the DIP_ALL group, then
    record *content* in the error log."""
    service = "Storm1.1.1"
    checkpoint = "SupervisorWhetherSurvivor"
    watchalert.sendAlertToGroups(service, checkpoint, subject, content,
                                 "DIP_ALL", True, True, False)
    logging.error(content)
def report_to_group(subject, content):
    """Raise a Kafka survival-monitor alert to the DIP_ALL group, then record
    *content* in the error log."""
    service = "Kafka"
    checkpoint = "KafkaSurvivalMonitor"
    watchalert.sendAlertToGroups(service, checkpoint, subject, content,
                                 "DIP_ALL", True, True, False)
    logging.error(content)
def report_to_group(subservice, subject, content):
    """Raise a databus alert for *subservice* to the DIP_ALL group, then
    record *content* in the info log."""
    alert_args = ("databus", subservice, subject, content,
                  "DIP_ALL", True, True, False)
    watchalert.sendAlertToGroups(*alert_args)
    logging.info(content)
# Alert when any expected streaming application from __APPS__ is absent from
# YARN's RUNNING application list.
parameters = {'states': 'RUNNING'}
try:
    apps = yarn_client.get_applications(parameters=parameters)
except Exception:
    logging.error("get failed apps from yarn error: %s" % traceback.format_exc())
    sys.exit(0)
# Names of everything currently running (empty when the query returned nothing).
appnames = [app['name'] for app in apps] if apps else []
not_running_apps = [expected for expected in __APPS__
                    if expected not in appnames]
if not_running_apps:
    watchalert.sendAlertToGroups("Hadoop-Yarn(Streaming)",
                                 "Application Not Running",
                                 str(not_running_apps), str(not_running_apps),
                                 "DIP_ALL", True, True, False)
# Query YARN for applications that FAILED inside the [times[0], times[1]]
# window and alert the DIP_ALL group with their names.
rm2 = 'd056081.eos.dip.sina.com.cn:8088'
timeout = 5
# rm1 and times are defined earlier in this file, outside this chunk.
yarn_client = yarn.YarnClient(rm1, rm2, timeout)
parameters = {
    'states': 'FAILED',
    'finishedTimeBegin': str(times[0]),
    'finishedTimeEnd': str(times[1]),
}
try:
    apps = yarn_client.get_applications(parameters=parameters)
except Exception:
    logging.error("get failed apps from yarn error: %s" % traceback.format_exc())
    sys.exit(0)
if apps:
    appnames = [app['name'] for app in apps]
    # logging.warn is a deprecated alias of logging.warning; also fixed the
    # "faild" typo in the log text.
    logging.warning("failed apps: %s" % str(appnames))
    # NOTE(review): "Application Falied" subject typo left untouched -- alert
    # routing may key on the exact subject string; confirm before fixing.
    watchalert.sendAlertToGroups(
        "Hadoop-Yarn(Batch)", "Application Falied", str(appnames),
        str(appnames), "DIP_ALL", True, True, False)
def report_to_group(subject, content):
    """Raise a Kafka2Hdfs existence alert to the DIP_ALL group, then record
    *content* in the error log."""
    service = "Databus"
    checkpoint = "Kafka2Hdfs WhetherExist"
    watchalert.sendAlertToGroups(service, checkpoint, subject, content,
                                 "DIP_ALL", True, True, False)
    logging.error(content)
# Flag RUNNING applications in the root.hive queue whose elapsed time since
# startedTime has reached __THRESHOLD__ (now is defined earlier in the file).
parameters = {'states': 'RUNNING'}
try:
    apps = yarn_client.get_applications(parameters=parameters)
except Exception:
    logging.error("get failed apps from yarn error: %s" % traceback.format_exc())
    sys.exit(0)
timeout_apps = [app['name'] for app in apps
                if app['queue'] == 'root.hive'
                and now - app['startedTime'] >= __THRESHOLD__]
if timeout_apps:
    logging.info("timeout apps: %s" % str(timeout_apps))
    watchalert.sendAlertToGroups("Hadoop-Yarn(Batch)", "Application Timeout",
                                 str(timeout_apps), str(timeout_apps),
                                 "DIP_ALL", True, True, False)
# --- Tail of get_threshold(qps, topic_name); its "def" line sits above this
# chunk and is not visible here, so only the final branches appear. ---
# The trace topic gets a fixed lag threshold; every other topic's threshold
# scales with its declared QPS.
if topic_name == 'dip-kafka2es-trace':
    return 50000000
return int(qps) * 50


if __name__ == "__main__":
    # ZooKeeper connection strings for the two Kafka clusters.
    # NOTE(review): zookeeper_ali is assigned but never used below -- confirm
    # whether the Aliyun cluster should also be checked.
    zookeeper_ali = 'first.zookeeper.aliyun.dip.weibo.com:2181,second.zookeeper.aliyun.dip.weibo.com:2181,third.zookeeper.aliyun.dip.weibo.com:2181/kafka/k1'
    zookeeper_k1001 = 'first.zookeeper.dip.weibo.com:2181,second.zookeeper.dip.weibo.com:2181,third.zookeeper.dip.weibo.com:2181/kafka/k1001'
    consumer_list = []
    # Fetch the consumer registry; on any failure alert and continue with an
    # empty list so the script exits quietly.
    # NOTE(review): bare "except:" also swallows SystemExit/KeyboardInterrupt
    # -- consider narrowing to Exception.
    try:
        consumer_list = mysql_client.get_consumers_info()
    except:
        subject = 'mysqlclient unable fetch data'
        content = 'mysqlclient unable fetch data \n %s' % traceback.format_exc(
        )
        logging.error(content)
        watchalert.sendAlertToGroups("Kafka", "KafkaClientAuto ConsumerOffset",
                                     subject, content,
                                     "DIP_ALL", True, True, False)
    for element in consumer_list:
        # Presumably one registry row per consumer: topic, group, owner, and
        # expected QPS -- verify against get_consumers_info()'s schema.
        topic_name = element['topic_name']
        consumer_group = element['consumer_group']
        contact_person = element['contact_person']
        qps = element['qps']
        # This topic is deliberately excluded from offset checking.
        if topic_name == 'app_weibomobilekafka1234_weibomobileaction26':
            continue
        threshold = get_threshold(qps, topic_name)
        KafkaClient(zookeeper_k1001, consumer_group, topic_name, threshold,
                    contact_person).consumer_offset_checker()
def report_to_group(msg, detail_msg):
    """Forward a salt-minion alert to the DIP_ALL group."""
    service, checkpoint = "salt", "salt minion"
    watchalert.sendAlertToGroups(service, checkpoint, msg, detail_msg,
                                 "DIP_ALL", True, True, False)
cursor = conn.cursor() cursor.execute(sql) failed_apps = [] for failed_app in cursor.fetchall(): failed_apps.append(failed_app[0]) if failed_apps: logging.error("failed apps: %s" % str(failed_apps)) watchalert.sendAlertToGroups("Portal-SelectJob", "SelectJob Falied", str(len(failed_apps)), str(failed_apps), "DIP_ALL", True, True, False) except Exception: logging.error("failed apps monitor error: %s" % traceback.format_exc()) finally: if cursor: try: cursor.close() except expression as identifier: pass if conn: try: conn.close() except expression as identifier: