Example #1
0
def __monitor(domain):
    """Alert when the waiting queue reported by *domain* is too long.

    Requests ``/rest/monitor/waitingQueue`` from *domain* through the
    ``http`` helper. When the response code is ``'200'`` and the
    ``selectjob`` entry of the payload is non-empty, its length is compared
    with the module-level ``__THRESHOLD__``. At or above the threshold a
    warning is logged and an alert is sent via
    ``watchalert.sendAlertToGroups``.

    Args:
        domain: Passed to ``http.get`` and included in the log/alert text.
    """
    response = http.get(domain=domain, url='/rest/monitor/waitingQueue')
    if response['code'] != '200':
        return

    waiting_jobs = response['data']['selectjob']
    if not waiting_jobs:
        return

    waiting_apps = len(waiting_jobs)
    if waiting_apps < __THRESHOLD__:
        return

    # logging.warn is a deprecated alias of logging.warning; passing the
    # values as arguments defers string formatting to the logging module.
    logging.warning('%s watting queue serious: %d', domain, waiting_apps)

    # The same "<domain>: <count>" text is used for both text arguments.
    summary = '%s: %d' % (domain, waiting_apps)
    watchalert.sendAlertToGroups("Portal-DIPScheduler",
                                 "WaittingQueue Serious",
                                 summary,
                                 summary,
                                 "DIP_ALL", True, True, False)
Example #2
0
def __monitor(domain):
    """Alert for every running application that exceeded ``__TIMEOUT__``.

    Requests ``/rest/monitor/runningList`` from *domain* through the
    ``http`` helper. When the response code is ``'200'`` and the
    ``selectjob`` entry of the payload is non-empty, each entry's
    ``executeTime`` is parsed and compared with the current local time.
    Entries whose elapsed seconds exceed the module-level ``__TIMEOUT__``
    are printed and reported via ``watchalert.sendAlertToGroups``.

    Args:
        domain: Passed to ``http.get`` to select the host to query.
    """
    now = datetime.now()

    response = http.get(domain=domain, url='/rest/monitor/runningList')
    if response['code'] != '200' or not response['data']['selectjob']:
        return

    for running_app in response['data']['selectjob']:
        running_app_name = running_app['jobName']
        # NOTE(review): '%b' and '%p' are locale-dependent in strptime;
        # this presumably expects English month names and AM/PM - confirm.
        started_at = datetime.strptime(
            running_app['executeTime'], '%b %d, %Y %I:%M:%S %p')

        elapsed_seconds = (now - started_at).total_seconds()
        if elapsed_seconds <= __TIMEOUT__:
            continue

        message = "Application {} timeout".format(running_app_name)
        # Parenthesised single-argument print behaves the same under the
        # Python 2 print statement and the Python 3 print function.
        print(message)

        watchalert.sendAlertToGroups(
            "Portal-DIPScheduler", "RunningList Serious",
            message, "",
            "DIP_ALL", True, True, False)
Example #3
0
def report_to_group(subject, content):
    """Send an alert for *subject*/*content*, then log *content* as an error.

    The alert is sent through ``watchalert.sendAlertToGroups`` with the
    fixed labels ``"Storm1.1.1"`` and ``"SupervisorWhetherSurvivor"``.
    """
    alert_args = (
        "Storm1.1.1",
        "SupervisorWhetherSurvivor",
        subject,
        content,
        "DIP_ALL",
        True,
        True,
        False,
    )
    watchalert.sendAlertToGroups(*alert_args)
    logging.error(content)
Example #4
0
def report_to_group(subject, content):
    """Send an alert for *subject*/*content*, then log *content* as an error.

    The alert is sent through ``watchalert.sendAlertToGroups`` with the
    fixed labels ``"Kafka"`` and ``"KafkaSurvivalMonitor"``.
    """
    service, monitor = "Kafka", "KafkaSurvivalMonitor"
    watchalert.sendAlertToGroups(
        service, monitor, subject, content, "DIP_ALL", True, True, False)
    logging.error(content)
Example #5
0
def report_to_group(subservice, subject, content):
    """Send a ``"databus"`` alert, then log *content* at INFO level.

    Args:
        subservice: Second positional argument of the alert call.
        subject: Third positional argument of the alert call.
        content: Fourth positional argument of the alert call; also logged.
    """
    alert_args = (
        "databus",
        subservice,
        subject,
        content,
        "DIP_ALL",
        True,
        True,
        False,
    )
    watchalert.sendAlertToGroups(*alert_args)
    logging.info(content)
Example #6
0
    # Ask the YARN client only for applications in the RUNNING state.
    parameters = {}

    parameters['states'] = 'RUNNING'

    try:
        apps = yarn_client.get_applications(parameters=parameters)
    except Exception:
        logging.error("get failed apps from yarn error: %s" %
                      traceback.format_exc())

        # NOTE(review): exits with status 0 although the query failed, and
        # the message says "failed apps" while RUNNING apps are requested -
        # confirm both are intended.
        sys.exit(0)

    # Names of every application returned by the query (empty when the
    # client returned a falsy result).
    appnames = []

    if apps:
        for app in apps:
            appnames.append(app['name'])

    # Entries of __APPS__ that do not appear among the returned names.
    not_running_apps = []

    for APP in __APPS__:
        if APP not in appnames:
            not_running_apps.append(APP)

    # Send one alert carrying the whole list; the same string is passed for
    # both text arguments.
    if not_running_apps:
        watchalert.sendAlertToGroups("Hadoop-Yarn(Streaming)",
                                     "Application Not Running",
                                     str(not_running_apps),
                                     str(not_running_apps), "DIP_ALL", True,
                                     True, False)
Example #7
0
    # Second address handed to YarnClient; rm1 is defined above this view.
    rm2 = 'd056081.eos.dip.sina.com.cn:8088'

    # NOTE(review): presumably a timeout in seconds - confirm against
    # yarn.YarnClient.
    timeout = 5

    yarn_client = yarn.YarnClient(rm1, rm2, timeout)

    # Ask for FAILED applications whose finish time lies between times[0]
    # and times[1] (times is defined above this view).
    parameters = {}

    parameters['states'] = 'FAILED'
    parameters['finishedTimeBegin'] = str(times[0])
    parameters['finishedTimeEnd'] = str(times[1])

    try:
        apps = yarn_client.get_applications(parameters=parameters)
    except Exception:
        logging.error("get failed apps from yarn error: %s" %
                      traceback.format_exc())

        # NOTE(review): exits with status 0 although the query failed -
        # confirm this is intended.
        sys.exit(0)

    # Any returned application is reported: log the names and send one
    # alert with the list as both text arguments.
    if apps:
        appnames = []

        for app in apps:
            appnames.append(app['name'])

        logging.warn("faild apps: %s" % str(appnames))

        watchalert.sendAlertToGroups(
            "Hadoop-Yarn(Batch)", "Application Falied", str(appnames), str(appnames), "DIP_ALL", True, True, False)
Example #8
0
def report_to_group(subject, content):
    """Send an alert for *subject*/*content*, then log *content* as an error.

    The alert is sent through ``watchalert.sendAlertToGroups`` with the
    fixed labels ``"Databus"`` and ``"Kafka2Hdfs WhetherExist"``.
    """
    alert_args = (
        "Databus",
        "Kafka2Hdfs WhetherExist",
        subject,
        content,
        "DIP_ALL",
        True,
        True,
        False,
    )
    watchalert.sendAlertToGroups(*alert_args)
    logging.error(content)
Example #9
0
    # Ask the YARN client only for applications in the RUNNING state.
    parameters = {}

    parameters['states'] = 'RUNNING'

    try:
        apps = yarn_client.get_applications(parameters=parameters)
    except Exception:
        logging.error("get failed apps from yarn error: %s" %
                      traceback.format_exc())

        # NOTE(review): exits with status 0 although the query failed, and
        # the message says "failed apps" while RUNNING apps are requested -
        # confirm both are intended.
        sys.exit(0)

    # Names of applications in queue 'root.hive' that have been running
    # for at least __THRESHOLD__.
    timeout_apps = []

    # NOTE(review): apps is iterated without a truthiness check; if
    # get_applications can return None this raises TypeError - confirm.
    for app in apps:
        if app['queue'] != 'root.hive':
            continue

        # NOTE(review): now (defined above this view), startedTime and
        # __THRESHOLD__ must share one time unit - presumably epoch
        # milliseconds; confirm.
        started_time = app['startedTime']
        if now - started_time >= __THRESHOLD__:
            timeout_apps.append(app['name'])

    # Log and send one alert carrying the whole list; the same string is
    # passed for both text arguments.
    if timeout_apps:
        logging.info("timeout apps: %s" % str(timeout_apps))

        watchalert.sendAlertToGroups("Hadoop-Yarn(Batch)",
                                     "Application Timeout", str(timeout_apps),
                                     str(timeout_apps), "DIP_ALL", True, True,
                                     False)
Example #10
0
    if topic_name == 'dip-kafka2es-trace':
        return 50000000
    return int(qps) * 50


if __name__ == "__main__":
    # Script entry point: load the consumer registry through mysql_client
    # and run a consumer-offset check for each entry against a threshold
    # obtained from get_threshold(qps, topic_name).

    # Not referenced in this block; kept because it is a module-level name.
    zookeeper_ali = 'first.zookeeper.aliyun.dip.weibo.com:2181,second.zookeeper.aliyun.dip.weibo.com:2181,third.zookeeper.aliyun.dip.weibo.com:2181/kafka/k1'
    zookeeper_k1001 = 'first.zookeeper.dip.weibo.com:2181,second.zookeeper.dip.weibo.com:2181,third.zookeeper.dip.weibo.com:2181/kafka/k1001'

    consumer_list = []
    try:
        consumer_list = mysql_client.get_consumers_info()
    except Exception:
        # Narrowed from a bare ``except:`` so that SystemExit and
        # KeyboardInterrupt are no longer swallowed and reported as a
        # database failure. On failure consumer_list stays empty, so the
        # loop below does nothing.
        subject = 'mysqlclient unable fetch data'
        content = 'mysqlclient unable fetch data \n %s' % (
            traceback.format_exc())
        logging.error(content)
        watchalert.sendAlertToGroups("Kafka", "KafkaClientAuto ConsumerOffset",
                                     subject, content, "DIP_ALL", True, True,
                                     False)

    for element in consumer_list:
        topic_name = element['topic_name']
        consumer_group = element['consumer_group']
        contact_person = element['contact_person']
        qps = element['qps']

        # This one topic is deliberately excluded from the check.
        if topic_name == 'app_weibomobilekafka1234_weibomobileaction26':
            continue

        threshold = get_threshold(qps, topic_name)
        checker = KafkaClient(zookeeper_k1001, consumer_group, topic_name,
                              threshold, contact_person)
        checker.consumer_offset_checker()
Example #11
0
def report_to_group(msg, detail_msg):
    """Send an alert for *msg*/*detail_msg* via ``watchalert``.

    The alert is sent through ``watchalert.sendAlertToGroups`` with the
    fixed labels ``"salt"`` and ``"salt minion"``.
    """
    alert_args = (
        "salt",
        "salt minion",
        msg,
        detail_msg,
        "DIP_ALL",
        True,
        True,
        False,
    )
    watchalert.sendAlertToGroups(*alert_args)
Example #12
0
        cursor = conn.cursor()

        cursor.execute(sql)

        failed_apps = []

        for failed_app in cursor.fetchall():
            failed_apps.append(failed_app[0])

        if failed_apps:
            logging.error("failed apps: %s" % str(failed_apps))

            watchalert.sendAlertToGroups("Portal-SelectJob",
                                         "SelectJob Falied",
                                         str(len(failed_apps)),
                                         str(failed_apps), "DIP_ALL", True,
                                         True, False)
    except Exception:
        logging.error("failed apps monitor error: %s" % traceback.format_exc())
    finally:
        if cursor:
            try:
                cursor.close()
            except expression as identifier:
                pass

        if conn:
            try:
                conn.close()
            except expression as identifier: