Ejemplo n.º 1
0
def triggered(request):
    """
    Tags: rules
    ---
    Process a trigger sent by the alert service.

    Based on the parameters of the request, this method will initiate actions
    to mitigate the conditions that triggered the rule and notify the users.

    The request must carry the shared secret in the `Cilia-Secret-Key`
    header, otherwise an UnauthorizedError is raised. A missing parameter
    raises RequiredParameterMissingError, a malformed one BadRequestError and
    an unknown/unavailable machine or rule NotFoundError.

    ---

    rule_id:
     type: string
     required: true
     description: the UUID of the rule that got triggered
    value:
     type: integer
     required: true
     description: >
       the value that triggered the rule by exceeding the threshold
    incident:
     type: string
     required: true
     description: the incident's UUID
    resource:
     type: string
     required: true
     description: the UUID of the resource for which the rule got triggered
    triggered:
     type: integer
     required: true
     description: 0 if the specified incident got resolved/untriggered
    triggered_now:
     type: integer
     required: true
     description: |
       0 in case this is not the first time the specified incident has
       raised an alert
    firing_since:
     type: string
     required: true
     description: |
       the time at which the rule raised an alert and sent a trigger to
       this API endpoint
    pending_since:
     type: string
     required: true
     description: |
       the time at which the rule evaluated to True and entered pending
       state. A rule can remain in pending state if a TriggerOffset has
       been configured. Datetime needed
    resolved_since:
     type: string
     required: true
     description: >
       the time at which the incident with the specified UUID resolved.\
       Datetime needed

    """
    # Do not publicly expose this API endpoint?
    if config.CILIA_SECRET_KEY != request.headers.get('Cilia-Secret-Key'):
        raise UnauthorizedError()

    params = params_from_request(request)

    # Every key read below must be validated here, so that a missing
    # parameter surfaces as RequiredParameterMissingError rather than as an
    # unhandled KeyError.
    keys = (
        'rule_id',
        'value',
        'incident',
        'resource',
        'triggered',
        'triggered_now',
        'firing_since',
        'pending_since',
        'resolved_since',
    )
    for key in keys:
        if key not in params:
            raise RequiredParameterMissingError(key)

    # Get the rule's UUID.
    # TODO rule_id = request.matchdict['rule']
    rule_id = params['rule_id']

    # Get resource and incidents ids.
    incident_id = str(params['incident'])
    resource_id = str(params['resource'])

    # Get timestamps. `pending_since` is required but currently unused.
    firing_since = str(params['firing_since'])
    resolved_since = str(params['resolved_since'])

    try:
        value = params['value']
        value = float(value)
    except (TypeError, ValueError) as err:
        log.error('Failed to cast "%s" to float: %r', value, err)
        raise BadRequestError('Failed to convert %s to float' % value)

    def int_to_bool(param):
        """Cast an integer-like flag to bool; empty/None counts as False."""
        try:
            return bool(int(param or 0))
        except (ValueError, TypeError) as err:
            log.error('Failed to cast int to bool: %r', err)
            raise BadRequestError('Failed to convert %s to boolean' % param)

    # Get flags indicating whether the incident has been (just) triggered.
    triggered = int_to_bool(params['triggered'])
    triggered_now = int_to_bool(params['triggered_now'])

    try:
        machine = Machine.objects.get(id=resource_id)  # missing_since=None?
    except Machine.DoesNotExist:
        raise NotFoundError('Machine with id %s does not exist' % resource_id)

    # A machine whose cloud/owner reference cannot be resolved is treated
    # exactly like a machine that does not exist.
    try:
        machine.cloud.owner
    except AttributeError:
        raise NotFoundError('Machine with id %s does not exist' % resource_id)

    if machine.cloud.deleted:
        raise NotFoundError('Machine with id %s does not exist' % resource_id)

    if machine.missing_since:
        raise NotFoundError('Machine with id %s does not exist' % resource_id)

    if machine.state == 'terminated':
        raise NotFoundError('Machine with id %s is terminated' % resource_id)

    if not machine.monitoring.hasmonitoring:
        raise NotFoundError('%s does not have monitoring enabled' % machine)

    # The rule must belong to the same owner as the machine.
    try:
        rule = Rule.objects.get(id=rule_id, owner_id=machine.owner.id)
    except Rule.DoesNotExist:
        raise NotFoundError('Rule with id %s does not exist' % rule_id)

    # FIXME For backwards compatibility.
    # NOTE(review): strftime('%s') is a platform-specific (glibc) extension
    # that yields the epoch based on local time - confirm this is intended.
    try:
        timestamp = resolved_since or firing_since
        timestamp = int(get_datetime(timestamp).strftime('%s'))
    except ValueError as err:
        log.error('Failed to cast datetime obj to unix timestamp: %r', err)
        raise BadRequestError(err)
    if triggered_now or not triggered:
        # First alert of the incident, or the incident got resolved.
        notification_level = 0
    else:
        # Number of whole rule-frequency periods elapsed since `timestamp`.
        import time
        notification_level = int((time.time() - timestamp) /
                                 rule.frequency.timedelta.total_seconds())
    # /

    rule_triggered(machine,
                   rule.title,
                   value,
                   triggered,
                   timestamp,
                   notification_level,
                   incident_id=incident_id)
    return Response('OK', 200)
Ejemplo n.º 2
0
    def list_machines(self):
        """Return list of machines for cloud

        A list of nodes is fetched from libcloud, the data is processed, stored
        on machine models, and a list of machine models is returned.

        Subclasses SHOULD NOT override or extend this method.

        There are instead a number of methods that are called from this method,
        to allow subclasses to modify the data according to the specific of
        their cloud type. These methods currently are:

            `self._list_machines__fetch_machines`
            `self._list_machines__get_machine_extra`
            `self._list_machines__machine_creation_date`
            `self._list_machines__machine_actions`
            `self._list_machines__postparse_machine`
            `self._list_machines__cost_machine`
            `self._list_machines__fetch_generic_machines`

        Subclasses that require special handling should override these, by
        default, dummy methods.

        Raises CloudUnauthorizedError, SSLError or CloudUnavailableError if
        the nodes cannot be fetched, and BadRequestError or ConflictError if
        a machine model fails to be saved.

        """

        # Try to query list of machines from provider API.
        try:
            nodes = self._list_machines__fetch_machines()
            log.info("List nodes returned %d results for %s.",
                     len(nodes), self.cloud)
        except InvalidCredsError as exc:
            log.warning("Invalid creds on running list_nodes on %s: %s",
                        self.cloud, exc)
            raise CloudUnauthorizedError(msg=exc.message)
        except ssl.SSLError as exc:
            log.error("SSLError on running list_nodes on %s: %s",
                      self.cloud, exc)
            raise SSLError(exc=exc)
        except Exception as exc:
            log.exception("Error while running list_nodes on %s", self.cloud)
            raise CloudUnavailableError(exc=exc)

        machines = []
        now = datetime.datetime.utcnow()

        # Loop-invariant helpers for the per-machine cost calculation.
        month_days = calendar.monthrange(now.year, now.month)[1]

        def parse_num(num):
            """Parse `num` as float, falling back to 0 on failure."""
            try:
                return float(num or 0)
            except (ValueError, TypeError):
                log.warning("Can't parse %r as float.", num)
                return 0

        # Process each machine in returned list.
        # Store previously unseen machines separately.
        new_machines = []
        for node in nodes:

            # Fetch machine mongoengine model from db, or initialize one.
            try:
                machine = Machine.objects.get(cloud=self.cloud,
                                              machine_id=node.id)
            except Machine.DoesNotExist:
                machine = Machine(cloud=self.cloud, machine_id=node.id).save()
                new_machines.append(machine)

            # Update machine_model's last_seen fields.
            machine.last_seen = now
            machine.missing_since = None

            # Get misc libcloud metadata.
            image_id = str(node.image or node.extra.get('imageId') or
                           node.extra.get('image_id') or
                           node.extra.get('image') or '')
            size = (node.size or node.extra.get('flavorId') or
                    node.extra.get('instancetype'))

            machine.name = node.name
            machine.image_id = image_id
            machine.size = size
            machine.state = config.STATES[node.state]
            machine.private_ips = node.private_ips
            machine.public_ips = node.public_ips

            # Set machine extra dict.
            # Make sure we don't meet any surprises when we try to json encode
            # later on in the HTTP response.
            extra = self._list_machines__get_machine_extra(machine, node)

            # Only values of existing keys get replaced, so mutating the dict
            # while iterating over it is safe here.
            for key, val in extra.items():
                try:
                    json.dumps(val)
                except TypeError:
                    extra[key] = str(val)
            machine.extra = extra

            # Set machine hostname: prefer the DNS name, else the first
            # address without a colon (i.e. skip IPv6-looking addresses).
            if machine.extra.get('dns_name'):
                machine.hostname = machine.extra['dns_name']
            else:
                # Either list may be None; treat that as "no addresses"
                # instead of failing on the concatenation.
                ips = (machine.public_ips or []) + (machine.private_ips or [])
                for ip in ips:
                    if ip and ':' not in ip:
                        machine.hostname = ip
                        break

            # Get machine tags from db
            tags = {tag.key: tag.value for tag in Tag.objects(
                owner=self.cloud.owner, resource=machine,
            ).only('key', 'value')}

            # Get machine creation date.
            try:
                created = self._list_machines__machine_creation_date(machine,
                                                                     node)
                if created:
                    machine.created = get_datetime(created)
            except Exception:
                log.exception("Error finding creation date for %s in %s.",
                              machine, self.cloud)
            # TODO: Consider if we should fall back to using current date.
            # if not machine_model.created:
            #     machine_model.created = datetime.datetime.utcnow()

            # Update with available machine actions.
            try:
                self._list_machines__machine_actions(machine, node)
            except Exception:
                log.exception("Error while finding machine actions "
                              "for machine %s:%s for %s",
                              machine.id, node.name, self.cloud)

            # Apply any cloud/provider specific post processing.
            try:
                self._list_machines__postparse_machine(machine, node)
            except Exception:
                log.exception("Error while post parsing machine %s:%s for %s",
                              machine.id, node.name, self.cloud)

            # Apply any cloud/provider cost reporting.
            try:
                # Costs set through tags take precedence, unless they are
                # missing or exceed the sanity limits below, in which case
                # the provider-specific cost hook is used instead.
                cph = parse_num(tags.get('cost_per_hour'))
                cpm = parse_num(tags.get('cost_per_month'))
                if not (cph or cpm) or cph > 100 or cpm > 100 * 24 * 31:
                    cph, cpm = map(parse_num,
                                   self._list_machines__cost_machine(machine,
                                                                     node))
                # Derive whichever of the two figures is missing.
                if not cph:
                    cph = float(cpm) / month_days / 24
                elif not cpm:
                    cpm = cph * 24 * month_days
                machine.cost.hourly = cph
                machine.cost.monthly = cpm

            except Exception:
                log.exception("Error while calculating cost "
                              "for machine %s:%s for %s",
                              machine.id, node.name, self.cloud)
            # Terminated machines never incur cost.
            if node.state.lower() == 'terminated':
                machine.cost.hourly = 0
                machine.cost.monthly = 0

            # Save all changes to machine model on the database.
            try:
                machine.save()
            except me.ValidationError as exc:
                log.error("Error adding %s: %s", machine.name, exc.to_dict())
                raise BadRequestError({"msg": exc.message,
                                       "errors": exc.to_dict()})
            except me.NotUniqueError as exc:
                log.error("Machine %s not unique error: %s", machine.name, exc)
                raise ConflictError("Machine with this name already exists")

            machines.append(machine)

        # Append generic-type machines, which aren't handled by libcloud.
        for machine in self._list_machines__fetch_generic_machines():
            machine.last_seen = now
            machine.missing_since = None
            machine.state = config.STATES[NodeState.UNKNOWN]
            for action in ('start', 'stop', 'reboot', 'destroy', 'rename',
                           'resume', 'suspend', 'undefine'):
                setattr(machine.actions, action, False)
            machine.actions.tag = True
            # allow reboot action for bare metal with key associated
            if machine.key_associations:
                machine.actions.reboot = True
            machine.save()
            machines.append(machine)

        # Set missing_since on machine models that were not part of this
        # listing and have not been marked as missing already.
        Machine.objects(cloud=self.cloud,
                        id__nin=[m.id for m in machines],
                        missing_since=None).update(missing_since=now)

        # Update RBAC Mappings given the list of nodes seen for the first time.
        self.cloud.owner.mapper.update(new_machines)

        # Update machine counts on cloud and org.
        # FIXME: resolve circular import issues
        from mist.api.clouds.models import Cloud
        self.cloud.machine_count = len(machines)
        self.cloud.save()
        self.cloud.owner.total_machine_count = sum(
            cloud.machine_count for cloud in Cloud.objects(
                owner=self.cloud.owner, deleted=None
            ).only('machine_count')
        )
        self.cloud.owner.save()

        # Close libcloud connection
        # NOTE(review): this is skipped whenever an exception is raised
        # above - consider whether the connection should be closed then too.
        try:
            self.disconnect()
        except Exception as exc:
            log.warning("Error while closing connection: %r", exc)

        return machines
Ejemplo n.º 3
0
def triggered(request):
    """
    Tags: rules
    ---
    Process a trigger sent by the alert service.

    Based on the parameters of the request, this method will initiate actions
    to mitigate the conditions that triggered the rule and notify the users.

    The request must carry the shared secret in the `Cilia-Secret-Key`
    header, otherwise an UnauthorizedError is raised. A missing parameter
    raises RequiredParameterMissingError, a malformed one BadRequestError, an
    unknown rule RuleNotFoundError and an unknown/missing resource
    NotFoundError.

    ---

    rule_id:
     type: string
     required: true
     description: the UUID of the rule that got triggered
    value:
     type: integer
     required: true
     description: >
       the value that triggered the rule by exceeding the threshold
    incident:
     type: string
     required: true
     description: the incident's UUID
    resource:
     type: string
     required: false
     description: |
       the UUID of the resource for which the rule got triggered. Required
       for resource-bound rules, ignored for arbitrary rules
    triggered:
     type: integer
     required: true
     description: 0 if the specified incident got resolved/untriggered
    triggered_now:
     type: integer
     required: true
     description: |
       0 in case this is not the first time the specified incident has
       raised an alert
    firing_since:
     type: string
     required: true
     description: |
       the time at which the rule raised an alert and sent a trigger to
       this API endpoint
    pending_since:
     type: string
     required: true
     description: |
       the time at which the rule evaluated to True and entered pending
       state. A rule can remain in pending state if a TriggerOffset has
       been configured. Datetime needed
    resolved_since:
     type: string
     required: true
     description: >
       the time at which the incident with the specified UUID resolved.\
       Datetime needed

    """
    # Do not publicly expose this API endpoint?
    if config.CILIA_SECRET_KEY != request.headers.get('Cilia-Secret-Key'):
        raise UnauthorizedError()

    params = params_from_request(request)

    # Every key read unconditionally below must be validated here, so that a
    # missing parameter surfaces as RequiredParameterMissingError rather than
    # as an unhandled KeyError. `resource` is deliberately not listed, since
    # it is only needed for resource-bound rules (checked further down).
    keys = (
        'rule_id',
        'value',
        'incident',
        'triggered',
        'triggered_now',
        'firing_since',
        'pending_since',
        'resolved_since',
    )
    for key in keys:
        if key not in params:
            raise RequiredParameterMissingError(key)

    # Get the rule's UUID.
    # TODO rule_id = request.matchdict['rule']
    rule_id = params['rule_id']

    # Get resource and incidents ids.
    incident_id = str(params['incident'])
    resource_id = str(params['resource']) if 'resource' in params else None

    # Get timestamps. `pending_since` is required but currently unused.
    firing_since = str(params['firing_since'])
    resolved_since = str(params['resolved_since'])

    try:
        value = params['value']
        value = float(value)
    except (TypeError, ValueError) as err:
        log.error('Failed to cast "%s" to float: %r', value, err)
        raise BadRequestError('Failed to convert %s to float' % value)

    def int_to_bool(param):
        """Cast an integer-like flag to bool; empty/None counts as False."""
        try:
            return bool(int(param or 0))
        except (ValueError, TypeError) as err:
            log.error('Failed to cast int to bool: %r', err)
            raise BadRequestError('Failed to convert %s to boolean' % param)

    # Get flags indicating whether the incident has been (just) triggered.
    triggered = int_to_bool(params['triggered'])
    triggered_now = int_to_bool(params['triggered_now'])

    # Get the timestamp at which the rule's state changed.
    # NOTE(review): strftime('%s') is a platform-specific (glibc) extension
    # that yields the epoch based on local time - confirm this is intended.
    try:
        timestamp = resolved_since or firing_since
        timestamp = int(get_datetime(timestamp).strftime('%s'))
    except ValueError as err:
        log.error('Failed to cast datetime obj to unix timestamp: %r', err)
        raise BadRequestError(err)

    try:
        rule = Rule.objects.get(id=rule_id)
    except Rule.DoesNotExist:
        raise RuleNotFoundError()

    # Validate resource, if the rule is resource-bound.
    resource = None
    if not rule.is_arbitrary():
        if resource_id is None:
            raise RequiredParameterMissingError('resource')
        resource_type = rule.resource_model_name
        Model = get_resource_model(resource_type)
        try:
            resource = Model.objects.get(id=resource_id, owner=rule.owner_id)
        except Model.DoesNotExist:
            raise NotFoundError('%s %s' % (resource_type, resource_id))
        if is_resource_missing(resource):
            raise NotFoundError('%s %s' % (resource_type, resource_id))
    else:
        resource_type = resource_id = None

    # Record the trigger, if it's a no-data, to refer to it later.
    # Tracking is keyed on the resource, so it only applies when the rule
    # resolved to an actual resource above.
    if isinstance(rule, NoDataRule) and resource is not None:
        if triggered:
            NoDataRuleTracker.add(rule.id, resource.id)
        else:
            NoDataRuleTracker.remove(rule.id, resource.id)
    # Run chain of rule's actions.
    run_chained_actions(
        rule.id,
        incident_id,
        resource_id,
        resource_type,
        value,
        triggered,
        triggered_now,
        timestamp,
    )
    return Response('OK', 200)