def _do_sensu_checkins(self, service_failed, msg):
    check_every = (
        '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60))
        if self.autoscaler
        else DEFAULT_CHECK_EVERY
    )
    # Magic-y numbers here; an alert will time out after two autoscaler run periods plus a five-minute buffer
    alert_delay = (
        '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60) * 2 + 5)
        if self.autoscaler
        else DEFAULT_TTL
    )
    sensu_args = dict(
        check_name=SERVICE_CHECK_NAME,
        scheduler=self.options.scheduler,
        app=self.apps[0],  # TODO (CLUSTERMAN-126)
        check_every=check_every,
        source=f'{self.options.cluster}_{self.options.pool}',
        ttl=alert_delay,
        alert_after=alert_delay,
        noop=self.options.dry_run,
        pool=self.options.pool,
    )
    if service_failed:
        sensu_args['output'] = f'FAILED: clusterman autoscaler failed ({msg})'
        sensu_args['status'] = Status.CRITICAL
    else:
        sensu_args['output'] = 'OK: clusterman autoscaler is fine'
    sensu_checkin(**sensu_args)
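# A worked example of the timing math above, with an assumed run_frequency of 600
# seconds (ten minutes): check_every becomes '10m', and alert_delay becomes
# 600 // 60 * 2 + 5 = '25m', i.e. two run periods plus the five-minute buffer.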
def run(self):
    while self.running:
        time.sleep(splay_event_time(
            self.run_interval,
            self.get_name() + staticconf.read_string('aws.region'),
        ))
        now = arrow.utcnow()
        with self.metrics_client.get_writer(METADATA) as writer:
            try:
                with suppress_request_limit_exceeded():
                    self.write_prices(now, writer)
            except socket.timeout:
                # We don't really care if we miss a few spot price changes, so just continue here
                logger.warning(f'Timed out getting spot prices:\n\n{format_exc()}')
                continue

        # Report successful run to Sensu.
        sensu_args = dict(
            check_name='check_clusterman_spot_prices_running',
            output='OK: clusterman spot_prices was successful',
            check_every='1m',
            source=self.options.aws_region,
            ttl='10m',
            noop=self.options.disable_sensu,
        )
        sensu_checkin(**sensu_args)
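# splay_event_time is not defined in this snippet. A minimal sketch of what it might
# do (the real clusterman helper may differ in details): hash the key (batch name
# plus region or cluster) into a stable offset so that multiple batch instances
# spread their wakeups across the interval instead of all firing at once.
import hashlib
import time


def splay_event_time(interval: int, key: str) -> float:
    """Seconds to sleep until this batch's next splayed slot within the interval."""
    # Stable per-batch offset in [0, interval), derived from the key
    offset = int(hashlib.md5(key.encode('utf8')).hexdigest(), 16) % interval
    # Time remaining until the next timestamp congruent to `offset` mod `interval`
    return interval - (time.time() - offset) % interval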
def run(self) -> None:
    # Load the pools on the first run; do it here so we get logging
    self.load_pool_managers()
    while self.running:
        time.sleep(splay_event_time(
            self.run_interval,
            self.get_name() + self.options.cluster,
        ))
        for pool, manager in self.pool_managers.items():
            logger.info(f'Reloading state for pool manager for pool {pool}')
            manager.reload_state()
            logger.info(f'Done reloading state for pool {pool}')

        successful = self.write_all_metrics()

        # Report successful run to Sensu.
        if successful:
            sensu_args = dict(
                check_name='check_clusterman_cluster_metrics_running',
                output='OK: clusterman cluster_metrics was successful',
                check_every='1m',
                source=self.options.cluster,
                ttl='10m',
                noop=self.options.disable_sensu,
            )
            sensu_checkin(**sensu_args)
def _get_signal_for_app(self, app: str) -> Signal:
    """Load the signal object to use for autoscaling for a particular app

    :param app: the name of the app to load a Signal for
    :returns: the configured app signal, or the default signal in case of an error
    """
    logger.info(f'Loading autoscaling signal for {app} on {self.pool} in {self.cluster}')

    # TODO (CLUSTERMAN-126, CLUSTERMAN-195) apps will eventually have separate namespaces from pools
    pool_namespace = POOL_NAMESPACE.format(pool=app, scheduler=self.scheduler)
    signal_namespace = staticconf.read_string(
        'autoscale_signal.namespace',
        default=app,
        namespace=pool_namespace,
    )

    try:
        # See if the pool has set up a custom signal correctly; if not, fall back to the default signal
        return Signal(
            self.cluster,
            self.pool,
            self.scheduler,
            app,
            pool_namespace,
            self.metrics_client,
            signal_namespace,
        )
    except NoSignalConfiguredException:
        logger.info(f'No signal configured for {app}, falling back to default')
        return self.default_signal
    except Exception:
        msg = f'WARNING: loading signal for {app} failed, falling back to default'
        logger.exception(msg)
        sensu_checkin(
            check_name=SIGNAL_LOAD_CHECK_NAME,
            status=Status.WARNING,
            output=msg,
            source=self.cluster,
            scheduler=self.scheduler,
            page=False,
            ttl=None,
            app=app,
            noop=not self.monitoring_enabled,
            pool=self.pool,
        )
        return self.default_signal
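# The fallback above hinges on NoSignalConfiguredException, which is not defined in
# this snippet. A minimal sketch of it, assuming (as the try-block comment suggests)
# that Signal's constructor raises it when the pool config has no custom signal set up:
class NoSignalConfiguredException(Exception):
    """Raised when an app/pool has no custom autoscaling signal configured."""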
def test_fallback(self, mock_sensu):
    sensu_checkin(
        check_name='my_check',
        output='output',
        source='my_source',
        app='non-existent',
    )
    assert mock_sensu.return_value.send_event.call_args == mock.call(
        name='my_check',
        output=self._sensu_output('output', 'my_source', app='non-existent'),
        source='my_source',
        status=Status.OK.value,
        runbook='y/my-runbook',
        team='my_team',
        page=True,
    )
def test_args_overrides_config(self, mock_sensu, app, pool, scheduler):
    sensu_checkin(
        check_name='my_check',
        output='output',
        source='my_source',
        team='a_different_team',
        app=app,
        pool=pool,
        scheduler=scheduler,
    )
    assert mock_sensu.return_value.send_event.call_args == mock.call(
        name='my_check',
        source='my_source',
        output=self._sensu_output('output', 'my_source', pool, app, scheduler),
        status=Status.OK.value,
        runbook='y/my-runbook' if not app else 'y/their-runbook',
        team='a_different_team',
        page=True,
    )
def test_sensu_checkin(self, mock_sensu, noop):
    sensu_checkin(
        check_name='my_check',
        output='output',
        source='my_source',
        noop=noop,
    )
    if noop:
        assert mock_sensu.return_value.send_event.call_count == 0
    else:
        assert mock_sensu.return_value.send_event.call_args == mock.call(
            name='my_check',
            output=self._sensu_output('output', 'my_source'),
            source='my_source',
            status=Status.OK.value,
            runbook='y/my-runbook',
            team='my_team',
            page=True,
        )
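# The assertions above use a _sensu_output helper that is not shown here. A minimal
# sketch consistent with the call sites, assuming the helper simply appends whatever
# pool/app/scheduler context was passed to the check-in; the body is a guess:
def _sensu_output(self, output, source, pool=None, app=None, scheduler=None):
    """Build the expected Sensu event output string for a check-in."""
    context = ', '.join(
        f'{key}={value}'
        for key, value in (('pool', pool), ('app', app), ('scheduler', scheduler))
        if value
    )
    return f'{output} ({context})' if context else output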
def run(self) -> None:
    # Load the pools on the first run; do it here so we get logging
    self.load_pool_managers()
    try:
        # self.running is a property from yelp_batch which checks version_checker to see if a watcher
        # config has changed. If so, the entire batch restarts and configs for the service are reloaded.
        while self.running:
            time.sleep(splay_event_time(
                self.run_interval,
                self.get_name() + self.options.cluster,
            ))
            for pool, manager in self.pool_managers.items():
                logger.info(f'Reloading state for pool manager for pool {pool}')
                manager.reload_state()
                logger.info(f'Done reloading state for pool {pool}')

            successful = self.write_all_metrics()

            # Report successful run to Sensu.
            if successful:
                sensu_args = dict(
                    check_name='check_clusterman_cluster_metrics_running',
                    output='OK: clusterman cluster_metrics was successful',
                    check_every='1m',
                    source=self.options.cluster,
                    ttl='20m',
                    noop=self.options.disable_sensu,
                )
                sensu_checkin(**sensu_args)
    except Exception:
        # yelp_batch doesn't show the whole traceback when something fails
        self.logger.exception('cluster metrics collector failed')