Exemple #1
0
def check_schedulers(r, schedulers):
    for s in schedulers:
        action('Check scheduler {} .....'.format(s[2:]))
        try:
            ts = r.get("zmon:metrics:{}:ts".format(s))
            if ts is None:
                error("No scheduling loop registered ( running/stuck? )")
                continue

            delta = int(time.time() - float(ts))
            action("... last loop")
            highlight("{}".format(delta))
            action("s ago ...")
            if delta > 300:
                error("Last loop more than 300s ago (stuck? restart?)".format(delta))
                continue

            if delta > 180:
                error("Last loop more than 180s ago (stuck? check logs/watch)".format(delta))
                continue

            action("...")
            ok()
        except Exception as e:
            error(e)
Exemple #2
0
def check_queues(redis):
    queues = ['zmon:queue:default', 'zmon:queue:snmp', 'zmon:queue:internal', 'zmon:queue:secure']

    for q in queues:
        action('Checking queue length ... {} ...'.format(q))
        l = redis.llen(q)
        action("...")
        highlight("{}".format(l))
        action(" ...")
        if l < 2000:
            ok()
            continue
        error("to many tasks")
Exemple #3
0
def check_workers(r, workers):
    for w in workers:
        action('Check worker {} ...'.format(w))
        try:
            ts = r.get("zmon:metrics:{}:ts".format(w))
            delta = time.time() - float(ts)
            delta = max(int(delta), 0)

            action("... last exec")
            highlight("{}".format(delta))
            action("s ago ...")
            if delta < 30:
                ok()
                continue

            error("no task execute recently")

        except Exception as e:
            error(e)