Beispiel #1
0
 def dispatch(self, host, names):
     """Notify core about new metrics of a host and record them on disk.

     Each name is turned into a target with statname() and decorated with
     MultiHandler.decorate_target(). A leading "%(head)s." is stripped from
     the metric alias, and metrics whose first alias component (the plugin)
     is listed in self.ignore_plugins are dropped.

     The remaining metrics are POSTed as JSON to "<CORE_URI>/new_metrics"
     together with the host uuid and the machine's collectd password. If
     self.fh is set, one "<host> <name>" line per name is then written to
     it and flushed.

     Nothing is sent or written when the machine cannot be found or when
     no metric survives the filter, and nothing is written when the POST
     raises. A non-OK response is only logged.

     :param host: uuid of the machine the metrics belong to
     :param names: iterable of metric names reported for that host
     :returns: None
     """
     machine = get_machine_from_uuid(host)
     if not machine:
         log.error("machine not found, wtf!")
         return
     multihandler = MultiHandler(host)
     head_prefix = "%(head)s."
     metrics = []
     for name in names:
         target = statname(host, name)
         metric = multihandler.decorate_target(target)
         alias = metric['alias']
         # Strip the head placeholder only when it really is a prefix.
         # (rfind(...) == 0 missed aliases containing the placeholder twice.)
         if alias.startswith(head_prefix):
             alias = alias[len(head_prefix):]
             metric['alias'] = alias
         plugin = alias.split('.')[0]
         if plugin not in self.ignore_plugins:
             metrics.append(metric)
     if not metrics:
         return
     log.info("New metrics for host %s, notifying core: %s", host, metrics)
     payload = {
         'uuid': host,
         'collectd_password': machine.collectd_password,
         'metrics': metrics,
     }
     # NOTE(review): no timeout is passed, so an unresponsive core can block
     # this call indefinitely -- consider adding one suited to the deployment.
     try:
         resp = requests.post(
             "%s/new_metrics" % mon_config.CORE_URI,
             data=json.dumps(payload),
             verify=mon_config.SSL_VERIFY
         )
     except Exception as exc:
         # Best effort: a failed notification is logged, not raised.
         log.error("Error notifying core: %r", exc)
         return
     if not resp.ok:
         log.error("Bad response from core: %s", resp.text)
     # also save to file in disk (all names, including ignored plugins)
     if self.fh is not None:
         try:
             for metric_name in names:
                 self.fh.write("%s %s\n" % (host, metric_name))
             self.fh.flush()
         except IOError as exc:
             log.error("Error writing to metrics file: %s", exc)
Beispiel #2
0
def check_machine(machine, rule_id=''):
    """Check all conditions for given machine with a single graphite query.

    If rule_id is specified, only that rule will be checked; otherwise every
    rule in machine.rules is considered.

    A machine that is not yet activated is never checked: if
    handler.check_head() succeeds it is marked activated and each of its
    conditions gets active_after set 30 seconds ahead, then the function
    returns either way.

    Conditions with an unknown operator or aggregate, or whose active_after
    is still in the future, are skipped. The rest are grouped by target,
    fetched in one handler.get_data() call and passed to check_condition()
    with the non-null datapoints.

    :param machine: machine object whose rules should be evaluated
    :param rule_id: optional id of the single rule to check
    :returns: None
    """

    # legacy short metric names mapped to their full target
    old_targets = {
        'cpu': 'cpu.total.nonidle',
        'load': 'load.shortterm',
        'ram': 'memory.nonfree_percent',
        'disk-read': 'disk.total.disk_octets.read',
        'disk-write': 'disk.total.disk_octets.write',
        'network-rx': 'interface.total.if_octets.rx',
        'network-tx': 'interface.total.if_octets.tx',
    }

    handler = MultiHandler(machine.uuid)

    # check if machine activated
    if not machine.activated:
        if handler.check_head():
            log.info("%s just got activated after %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
            with machine.lock_n_load():
                machine.activated = True
                machine.save()
                for rid in machine.rules:
                    # Same race the gather loop below tolerates: skip a
                    # stale rule so the remaining ones still get their delay.
                    try:
                        condition = machine.get_condition(rid)
                    except ConditionNotFoundError:
                        log.warning(
                            "%s/%s condition not found while activating, "
                            "skipping", machine.uuid, rid)
                        continue
                    condition.active_after = time() + 30
                    condition.save()
        else:
            log.info("%s not activated since %s", machine.uuid,
                     tdelta_to_str(time() - machine.enabled_time))
        return

    # gather all conditions, grouped by the target they watch
    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rid in rules:
        lbl = "%s/%s" % (machine.uuid, rid)
        try:
            condition = machine.get_condition(rid)
        except ConditionNotFoundError:
            log.warning(
                "%s condition not found, probably rule just got "
                "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        if condition.operator not in ('gt', 'lt'):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            # persist the default so the condition is fixed for later runs
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = 'all'
            condition.save()
        if condition.aggregate not in ('all', 'any', 'avg'):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        conditions.setdefault(target, []).append(condition)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return

    try:
        # Pass a snapshot: `conditions` is mutated below while the results
        # are consumed, so a live keys() view must not be handed out.
        data = handler.get_data(list(conditions), start='-90sec')
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return

    # check all conditions
    # NOTE(review): pop() means that if get_data ever returns several items
    # for one requested target, only the first is checked and the others are
    # reported as unexpected -- confirm it returns one item per target.
    for item in data:
        target = item['_requested_target']
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s",
                        machine.uuid, target)
            continue
        datapoints = [(val, ts) for val, ts in item['datapoints']
                      if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule", machine.uuid,
                            condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)

    # whatever is left was requested but not returned by get_data
    for target, leftover in conditions.items():
        for cond in leftover:
            if target == "nodata":
                # if nodata rule didn't return any datapoints, the whisper
                # files must be missing, so make the rule true
                check_condition(cond, [(1, 0)])
            else:
                log.warning("%s/%s [%s] target not found for rule",
                            machine.uuid, cond.rule_id, cond)
Beispiel #3
0
def check_machine(machine, rule_id=""):
    """Check all conditions for given machine with a single graphite query.

    If rule_id is specified, only that rule will be checked; otherwise every
    rule in machine.rules is considered.

    A machine that is not yet activated is never checked; activation is
    attempted instead and the function returns. For an activated machine the
    eligible conditions are grouped by target, fetched with one
    handler.get_data() call and handed to check_condition() together with
    the non-null datapoints of their target.

    :param machine: machine object whose rules should be evaluated
    :param rule_id: optional id of the single rule to check
    :returns: None
    """
    handler = MultiHandler(machine.uuid)

    if not machine.activated:
        _try_activate_machine(machine, handler)
        return

    conditions = _gather_conditions(machine, rule_id)
    if not conditions:
        log.warning("%s no rules found", machine.uuid)
        return

    try:
        # Pass a snapshot: `conditions` is mutated below while the results
        # are consumed, so a live keys() view must not be handed out.
        data = handler.get_data(list(conditions), start="-90sec")
    except GraphiteError as exc:
        log.warning("%s error fetching stats %r", machine.uuid, exc)
        return

    # check all conditions
    # NOTE(review): pop() means that if get_data ever returns several items
    # for one requested target, only the first is checked and the others are
    # reported as unexpected -- confirm it returns one item per target.
    for item in data:
        target = item["_requested_target"]
        if target not in conditions:
            log.warning("%s get data returned unexpected target %s", machine.uuid, target)
            continue
        datapoints = [(val, ts) for val, ts in item["datapoints"] if val is not None]
        for condition in conditions.pop(target):
            if not datapoints:
                log.warning("%s/%s [%s] no data for rule", machine.uuid, condition.rule_id, condition)
                continue
            check_condition(condition, datapoints)

    # whatever is left was requested but not returned by get_data
    for target, leftover in conditions.items():
        for cond in leftover:
            if target == "nodata":
                # if nodata rule didn't return any datapoints, the whisper
                # files must be missing, so make the rule true
                check_condition(cond, [(1, 0)])
            else:
                log.warning("%s/%s [%s] target not found for rule", machine.uuid, cond.rule_id, cond)


def _try_activate_machine(machine, handler):
    """Mark machine activated if handler.check_head() succeeds, delaying its rules by 30s."""
    head_found = handler.check_head()
    waited = tdelta_to_str(time() - machine.enabled_time)
    if not head_found:
        log.info("%s not activated since %s", machine.uuid, waited)
        return

    log.info("%s just got activated after %s", machine.uuid, waited)
    with machine.lock_n_load():
        machine.activated = True
        machine.save()
        for rid in machine.rules:
            # Same race _gather_conditions tolerates: skip a stale rule so
            # the remaining ones still get their delay.
            try:
                condition = machine.get_condition(rid)
            except ConditionNotFoundError:
                log.warning("%s/%s condition not found while activating, skipping", machine.uuid, rid)
                continue
            condition.active_after = time() + 30
            condition.save()


def _gather_conditions(machine, rule_id):
    """Return a dict mapping target to the list of machine conditions that are ready to check."""
    # legacy short metric names mapped to their full target
    old_targets = {
        "cpu": "cpu.total.nonidle",
        "load": "load.shortterm",
        "ram": "memory.nonfree_percent",
        "disk-read": "disk.total.disk_octets.read",
        "disk-write": "disk.total.disk_octets.write",
        "network-rx": "interface.total.if_octets.rx",
        "network-tx": "interface.total.if_octets.tx",
    }

    conditions = {}
    rules = [rule_id] if rule_id else machine.rules
    for rid in rules:
        lbl = "%s/%s" % (machine.uuid, rid)
        try:
            condition = machine.get_condition(rid)
        except ConditionNotFoundError:
            log.warning("%s condition not found, probably rule just got " "updated, will check on next run", lbl)
            continue
        lbl = "%s [%s]" % (lbl, condition)
        target = old_targets.get(condition.metric, condition.metric)
        if condition.operator not in ("gt", "lt"):
            log.error("%s unknown operator '%s'", lbl, condition.operator)
            continue
        if not condition.aggregate:
            # persist the default so the condition is fixed for later runs
            log.warning("%s setting aggregate to 'all'", lbl)
            condition.aggregate = "all"
            condition.save()
        if condition.aggregate not in ("all", "any", "avg"):
            log.error("%s unknown aggregate '%s'", lbl, condition.aggregate)
            continue
        if condition.active_after > time():
            log.info("%s not yet active", lbl)
            continue
        conditions.setdefault(target, []).append(condition)
    return conditions