def add_machine(uuid, password, update_collectd=True):
    """Register a machine for monitoring, or refresh an existing one.

    If a machine with this uuid already exists, its collectd password and
    enabled time are overwritten under the machine's lock; otherwise a new
    Machine record is created. In both cases the "nodata" rule is
    (re)installed for the machine.

    Args:
        uuid: Identifier of the machine. Must be truthy.
        password: Collectd password to store on the machine. Must be truthy.
        update_collectd: When true, call update_collectd_conf() after the
            machine has been stored.

    Raises:
        RequiredParameterMissingError: If uuid or password is falsy.
    """
    # Validate in declaration order so a missing uuid is reported first.
    for param_name, param_value in (("uuid", uuid), ("password", password)):
        if not param_value:
            raise RequiredParameterMissingError(param_name)

    existing = get_machine_from_uuid(uuid)
    if not existing:
        # Unknown machine: build a fresh record and persist it.
        machine = Machine()
        machine.uuid = uuid
        machine.collectd_password = password
        machine.enabled_time = time()
        machine.create()
    else:
        # Known machine: re-adding is not an error, just refresh its
        # credentials and enabled timestamp while holding its lock.
        machine = existing
        with machine.lock_n_load():
            machine.collectd_password = password
            machine.enabled_time = time()
            machine.save()

    # Add uuid/passwd in collectd.passwd.
    if update_collectd:
        update_collectd_conf()

    # Every monitored machine gets a no-data rule.
    add_rule(machine.uuid, "nodata", "nodata", "gt", 0)
def add_rule(uuid, rule_id, metric, operator, value, aggregate="all",
             reminder_list=None, reminder_offset=0, active_after=30):
    """Add a rule to a machine, or replace the condition of an existing rule.

    A new Condition is always created. If the machine already has a rule
    with this rule_id and that rule points at a condition, the old condition
    is deleted before the rule is pointed at the new one.

    Args:
        uuid: Identifier of the machine that owns the rule.
        rule_id: Key of the rule inside machine.rules.
        metric: Stored on the condition as its metric.
        operator: Stored on the condition as its operator.
        value: Stored on the condition as its value.
        aggregate: One of 'all', 'any' or 'avg'.
        reminder_list: Should be a list of integers (notifications after the
            rule being triggered, in seconds). When falsy it is not set on
            the condition, so the default will be used.
        reminder_offset: Stored on the condition as its reminder offset.
        active_after: Added to the current time() to obtain the condition's
            active_after value.

    Raises:
        BadRequestError: If aggregate is not one of the accepted values.
        MachineNotFoundError: If no machine exists for uuid.
    """
    if aggregate not in ('all', 'any', 'avg'):
        raise BadRequestError(
            "Param 'aggregate' must be in "
            "('all', 'any', 'avg')."
        )

    machine = get_machine_from_uuid(uuid)
    if not machine:
        raise MachineNotFoundError(uuid)

    # Build the replacement condition first; it is linked to the rule below.
    new_cond = Condition()
    new_cond.uuid = uuid
    new_cond.rule_id = rule_id
    new_cond.cond_id = get_rand_token()
    new_cond.active_after = time() + active_after
    new_cond.metric = metric
    new_cond.operator = operator
    new_cond.aggregate = aggregate
    new_cond.value = value
    if reminder_list:
        new_cond.reminder_list = reminder_list
    new_cond.reminder_offset = reminder_offset
    # Notification level starts at 1 so that new rules that are not
    # satisfied don't send an OK to core immediately after creation.
    new_cond.notification_level = 1
    # TODO: verify target is valid
    new_cond.create()

    with machine.lock_n_load():
        # Make sure the rule exists before looking it up.
        if rule_id not in machine.rules:
            machine.rules[rule_id] = Rule()
        rule = machine.rules[rule_id]

        # Drop the condition the rule used to point at, if any.
        if rule.warning:
            machine.get_condition(rule_id).delete()

        # Point the rule at the freshly created condition.
        rule.warning = new_cond.cond_id
        machine.save()
def remove_rule(uuid, rule_id):
    """Delete a rule of a machine together with the rule's condition.

    Args:
        uuid: Identifier of the machine that owns the rule.
        rule_id: Key of the rule inside machine.rules.

    Raises:
        MachineNotFoundError: If no machine exists for uuid.
        RuleNotFoundError: If the machine has no rule with this rule_id.
    """
    machine = get_machine_from_uuid(uuid)
    if not machine:
        raise MachineNotFoundError(uuid)

    with machine.lock_n_load():
        if rule_id not in machine.rules:
            raise RuleNotFoundError(rule_id)

        # The condition goes first, then the rule entry that referenced it.
        machine.get_condition(rule_id).delete()
        del machine.rules[rule_id]
        machine.save()
def dispatch(self, host, names):
    """Report newly seen metrics of a host to core and append them to file.

    Each name is turned into a target via statname() and decorated by a
    MultiHandler. Metrics whose plugin (the first dot-separated component of
    the alias) is listed in self.ignore_plugins are dropped. The remaining
    metrics are POSTed as JSON to "<CORE_URI>/new_metrics", and afterwards
    every name (ignored or not) is appended to self.fh when it is set.

    Nothing is posted or written when the machine is unknown, when no metric
    survives filtering, or (for the file write) when the POST itself raises.

    Args:
        host: uuid of the machine the metrics belong to.
        names: Iterable of metric names. It is iterated twice, so it must be
            re-iterable (e.g. a list).

    Returns:
        None.
    """
    machine = get_machine_from_uuid(host)
    if not machine:
        log.error("machine not found, wtf!")
        return

    head_prefix = "%(head)s."
    multihandler = MultiHandler(host)
    metrics = []
    for name in names:
        target = statname(host, name)
        metric = multihandler.decorate_target(target)
        alias = metric['alias']
        # Strip the leading "%(head)s." placeholder. A prefix test is used
        # (rather than rfind(...) == 0) so the prefix is still stripped when
        # the same token happens to occur again later in the alias.
        if alias.startswith(head_prefix):
            alias = alias[len(head_prefix):]
            metric['alias'] = alias
        plugin = alias.split('.')[0]
        if plugin not in self.ignore_plugins:
            metrics.append(metric)
    if not metrics:
        return

    log.info("New metrics for host %s, notifying core: %s", host, metrics)
    payload = {
        'uuid': host,
        'collectd_password': machine.collectd_password,
        'metrics': metrics,
    }
    try:
        # NOTE(review): no timeout is passed here, so this call may block
        # for as long as core keeps the connection open - confirm whether
        # that is acceptable for the caller.
        resp = requests.post(
            "%s/new_metrics" % mon_config.CORE_URI,
            data=json.dumps(payload),
            verify=mon_config.SSL_VERIFY
        )
    except Exception as exc:
        log.error("Error notifying core: %r", exc)
        return
    if not resp.ok:
        # A bad response is only logged; the names are still saved below.
        log.error("Bad response from core: %s", resp.text)

    # Also save to file on disk.
    if self.fh is not None:
        try:
            for name in names:
                self.fh.write("%s %s\n" % (host, name))
            self.fh.flush()
        except IOError as exc:
            log.error("Error writing to metrics file: %s", exc)
def remove_machine(uuid):
    """Remove a machine from the monitored list and from collectd's conf.

    All rules of the machine are removed first on a best-effort basis: a
    failure to remove one rule is logged and does not stop the removal of
    the remaining rules or of the machine itself.

    Args:
        uuid: Identifier of the machine to remove. Must be truthy.

    Raises:
        RequiredParameterMissingError: If uuid is falsy.
        MachineNotFoundError: If no machine exists for uuid.
    """
    if not uuid:
        raise RequiredParameterMissingError("uuid")
    machine = get_machine_from_uuid(uuid)
    if not machine:
        raise MachineNotFoundError(uuid)

    # Iterate over a snapshot of the rule ids: remove_rule() deletes entries
    # from a machine's rules, and if that is the same mapping as the one
    # held here, iterating it live would fail once its size changes.
    for rule_id in list(machine.rules):
        try:
            remove_rule(uuid, rule_id)
        except Exception as exc:
            # Best effort: keep going, but record why the rule survived.
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            log.error("Error removing rule '%s': %r", rule_id, exc)

    machine.delete()
    # Reconstruct collectd passwords file to remove uuid/passwd.
    update_collectd_conf()