Example #1
    def _do_sensu_checkins(self, service_failed, msg):
        check_every = (
            '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60))
            if self.autoscaler else DEFAULT_CHECK_EVERY
        )
        # Magic numbers: an alert times out after two autoscaler run periods plus a five-minute buffer.
        alert_delay = (
            '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60) * 2 + 5)
            if self.autoscaler else DEFAULT_TTL
        )

        sensu_args = dict(
            check_name=SERVICE_CHECK_NAME,
            scheduler=self.options.scheduler,
            app=self.apps[0],  # TODO (CLUSTERMAN-126)
            check_every=check_every,
            source=f'{self.options.cluster}_{self.options.pool}',
            ttl=alert_delay,
            alert_after=alert_delay,
            noop=self.options.dry_run,
            pool=self.options.pool,
        )

        if service_failed:
            sensu_args['output'] = f'FAILED: clusterman autoscaler failed ({msg})'
            sensu_args['status'] = Status.CRITICAL
        else:
            sensu_args['output'] = 'OK: clusterman autoscaler is fine'
        sensu_checkin(**sensu_args)
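
For concreteness, the interval arithmetic described in the comment above works out as follows. This is a standalone sketch: the 600-second run_frequency and the DEFAULT_CHECK_EVERY / DEFAULT_TTL values are placeholders for illustration, not the batch's actual constants.

    # Standalone sketch of the check_every / alert_delay math, assuming a
    # run_frequency of 600 seconds and placeholder defaults.
    DEFAULT_CHECK_EVERY = '10m'
    DEFAULT_TTL = '25m'

    def compute_sensu_intervals(run_frequency=None):
        """Return (check_every, alert_delay) strings for a Sensu check-in."""
        if run_frequency is None:
            return DEFAULT_CHECK_EVERY, DEFAULT_TTL
        minutes = int(run_frequency // 60)
        # an alert times out after two run periods plus a five-minute buffer
        return f'{minutes}m', f'{minutes * 2 + 5}m'

    assert compute_sensu_intervals(600) == ('10m', '25m')
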
Example #2
    def run(self):
        while self.running:
            time.sleep(splay_event_time(
                self.run_interval,
                self.get_name() + staticconf.read_string('aws.region'),
            ))

            now = arrow.utcnow()
            with self.metrics_client.get_writer(METADATA) as writer:
                try:
                    with suppress_request_limit_exceeded():
                        self.write_prices(now, writer)
                except socket.timeout:
                    # We don't really care if we miss a few spot price changes so just continue here
                    logger.warning(f'Timed out getting spot prices:\n\n{format_exc()}')
                    continue

            # Report successful run to Sensu.
            sensu_args = dict(
                check_name='check_clusterman_spot_prices_running',
                output='OK: clusterman spot_prices was successful',
                check_every='1m',
                source=self.options.aws_region,
                ttl='10m',
                noop=self.options.disable_sensu,
            )
            sensu_checkin(**sensu_args)
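
The loop above wraps its AWS calls in suppress_request_limit_exceeded, whose implementation is not part of this listing. The sketch below shows one plausible shape for such a context manager, assuming it swallows boto-style throttling errors and logs them; the real helper may behave differently.

    # Hypothetical sketch only; the actual helper may differ.
    import logging
    from contextlib import contextmanager

    from botocore.exceptions import ClientError

    logger = logging.getLogger(__name__)

    @contextmanager
    def suppress_request_limit_exceeded():
        try:
            yield
        except ClientError as e:
            if e.response.get('Error', {}).get('Code') != 'RequestLimitExceeded':
                raise
            logger.warning('Hit the AWS request limit; skipping this iteration')
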
Example #3
    def run(self) -> None:
        # Load the pools on the first run; do it here so we get logging.
        self.load_pool_managers()

        while self.running:
            time.sleep(
                splay_event_time(
                    self.run_interval,
                    self.get_name() + self.options.cluster,
                ))

            for pool, manager in self.pool_managers.items():
                logger.info(
                    f'Reloading state for pool manager for pool {pool}')
                manager.reload_state()
                logger.info(f'Done reloading state for pool {pool}')

            successful = self.write_all_metrics()

            # Report successful run to Sensu.
            if successful:
                sensu_args = dict(
                    check_name='check_clusterman_cluster_metrics_running',
                    output='OK: clusterman cluster_metrics was successful',
                    check_every='1m',
                    source=self.options.cluster,
                    ttl='10m',
                    noop=self.options.disable_sensu,
                )
                sensu_checkin(**sensu_args)
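
The run loops above both sleep for splay_event_time(run_interval, key) before doing work. The helper itself is not shown in this listing; the sketch below illustrates the general idea, assuming it derives a deterministic per-key offset so that batches sharing an interval do not all wake at the same moment. The real implementation may add jitter differently.

    # Illustrative sketch of event splaying; not the actual implementation.
    import hashlib
    import time

    def splay_event_time(frequency, key):
        """Seconds to sleep so each key wakes at its own offset within the period."""
        offset = int(hashlib.md5(key.encode('utf-8')).hexdigest(), 16) % frequency
        return (offset - time.time()) % frequency
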
Example #4
    def _get_signal_for_app(self, app: str) -> Signal:
        """Load the signal object to use for autoscaling for a particular app

        :param app: the name of the app to load a Signal for
        :returns: the configured app signal, or the default signal in case of an error
        """
        logger.info(
            f'Loading autoscaling signal for {app} on {self.pool} in {self.cluster}'
        )

        # TODO (CLUSTERMAN-126, CLUSTERMAN-195) apps will eventually have separate namespaces from pools
        pool_namespace = POOL_NAMESPACE.format(pool=app,
                                               scheduler=self.scheduler)
        signal_namespace = staticconf.read_string('autoscale_signal.namespace',
                                                  default=app,
                                                  namespace=pool_namespace)

        try:
            # see if the pool has set up a custom signal correctly; if not, fall back to the default signal
            return Signal(
                self.cluster,
                self.pool,
                self.scheduler,
                app,
                pool_namespace,
                self.metrics_client,
                signal_namespace,
            )
        except NoSignalConfiguredException:
            logger.info(
                f'No signal configured for {app}, falling back to default')
            return self.default_signal
        except Exception:
            msg = f'WARNING: loading signal for {app} failed, falling back to default'
            logger.exception(msg)
            sensu_checkin(
                check_name=SIGNAL_LOAD_CHECK_NAME,
                status=Status.WARNING,
                output=msg,
                source=self.cluster,
                scheduler=self.scheduler,
                page=False,
                ttl=None,
                app=app,
                noop=not self.monitoring_enabled,
                pool=self.pool,
            )
            return self.default_signal
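
Example #4 resolves the signal namespace with a namespaced staticconf read that falls back to the app name. The snippet below exercises that lookup in isolation with staticconf's testing helpers; the pool namespace and config values are made up for illustration.

    # Hypothetical values; shows only how the namespaced read with a default resolves.
    import staticconf
    import staticconf.testing

    pool_namespace = 'my_app.mesos_config'  # stand-in for POOL_NAMESPACE.format(...)
    with staticconf.testing.MockConfiguration(
        {'autoscale_signal.namespace': 'custom_signals'},
        namespace=pool_namespace,
    ):
        assert staticconf.read_string(
            'autoscale_signal.namespace', default='my_app', namespace=pool_namespace,
        ) == 'custom_signals'
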
Example #5
    def test_fallback(self, mock_sensu):
        sensu_checkin(
            check_name='my_check',
            output='output',
            source='my_source',
            app='non-existent',
        )
        assert mock_sensu.return_value.send_event.call_args == mock.call(
            name='my_check',
            output=self._sensu_output('output', 'my_source', app='non-existent'),
            source='my_source',
            status=Status.OK.value,
            runbook='y/my-runbook',
            team='my_team',
            page=True,
        )
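
The tests in Examples #5 through #7 take a mock_sensu argument whose instance exposes send_event. The fixture itself is not shown; below is one plausible way to wire it with pytest and mock. The patch target is a made-up placeholder, not the project's real module path.

    # Hypothetical fixture wiring; the real tests may patch a different target.
    from unittest import mock

    import pytest

    @pytest.fixture
    def mock_sensu():
        # 'clusterman.util.get_sensu' is an illustrative patch target only
        with mock.patch('clusterman.util.get_sensu') as mock_get_sensu:
            yield mock_get_sensu
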
Example #6
    def test_args_overrides_config(self, mock_sensu, app, pool, scheduler):
        sensu_checkin(
            check_name='my_check',
            output='output',
            source='my_source',
            team='a_different_team',
            app=app,
            pool=pool,
            scheduler=scheduler,
        )

        assert mock_sensu.return_value.send_event.call_args == mock.call(
            name='my_check',
            source='my_source',
            output=self._sensu_output('output', 'my_source', pool, app, scheduler),
            status=Status.OK.value,
            runbook='y/my-runbook' if not app else 'y/their-runbook',
            team='a_different_team',
            page=True,
        )
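
Example #6 runs once per (app, pool, scheduler) combination. A pytest.mark.parametrize decorator along these lines could drive it; the value tuples here are illustrative, not the suite's actual parameters.

    # Illustrative parametrization only.
    import pytest

    @pytest.mark.parametrize('app,pool,scheduler', [
        (None, None, None),
        ('my_app', 'my_pool', 'mesos'),
    ])
    def test_args_overrides_config(mock_sensu, app, pool, scheduler):
        ...  # body as shown in Example #6
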
Example #7
    def test_sensu_checkin(self, mock_sensu, noop):
        sensu_checkin(
            check_name='my_check',
            output='output',
            source='my_source',
            noop=noop,
        )

        if noop:
            assert mock_sensu.return_value.send_event.call_count == 0
        else:
            assert mock_sensu.return_value.send_event.call_args == mock.call(
                name='my_check',
                output=self._sensu_output('output', 'my_source'),
                source='my_source',
                status=Status.OK.value,
                runbook='y/my-runbook',
                team='my_team',
                page=True,
            )
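
Example #7 checks that a truthy noop suppresses the Sensu event entirely. A check-in helper can implement that with an early return, as sketched below; this illustrates the behavior under test rather than the project's actual sensu_checkin, and the client is injected only to keep the example standalone.

    # Minimal, self-contained sketch of the noop short-circuit the test exercises.
    def sensu_checkin(client, check_name, output, source, noop=False, **kwargs):
        if noop:
            return  # dry runs and disabled monitoring send nothing
        client.send_event(name=check_name, output=output, source=source, **kwargs)
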
Example #8
    def run(self) -> None:
        # Load the pools on the first run; do it here so we get logging.
        self.load_pool_managers()

        try:
            # self.running is a property from yelp_batch; it uses version_checker to detect whether a
            # watched config has changed. If so, the entire batch restarts and configs for the service
            # are reloaded.
            while self.running:
                time.sleep(
                    splay_event_time(
                        self.run_interval,
                        self.get_name() + self.options.cluster,
                    ))

                for pool, manager in self.pool_managers.items():
                    logger.info(
                        f'Reloading state for pool manager for pool {pool}')
                    manager.reload_state()
                    logger.info(f'Done reloading state for pool {pool}')

                successful = self.write_all_metrics()

                # Report successful run to Sensu.
                if successful:
                    sensu_args = dict(
                        check_name='check_clusterman_cluster_metrics_running',
                        output='OK: clusterman cluster_metrics was successful',
                        check_every='1m',
                        source=self.options.cluster,
                        ttl='20m',
                        noop=self.options.disable_sensu,
                    )
                    sensu_checkin(**sensu_args)

        except Exception:
            # yelp_batch doesn't show the whole traceback when something fails
            self.logger.exception('cluster metrics collector failed')
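
The comment at the top of this loop explains that self.running comes from yelp_batch and flips when a watched config changes so the whole batch restarts. yelp_batch is not shown in this listing; the sketch below illustrates that restart-on-config-change idea with a simple mtime check, which is only an approximation of what version_checker does.

    # Illustrative only; the real version_checker differs in detail.
    import os

    class ConfigWatchingLoop:
        def __init__(self, watched_files):
            self._mtimes = {path: os.path.getmtime(path) for path in watched_files}

        @property
        def running(self):
            # keep looping until any watched config file changes on disk
            return all(os.path.getmtime(path) == mtime
                       for path, mtime in self._mtimes.items())
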