def __init__(
    self,
    cluster: str,
    pool: str,
    scheduler: str,
    fetch_state: bool = True,
) -> None:
    """Bind this manager to a cluster/pool/scheduler and read its scaling limits.

    :param cluster: name of the cluster this pool belongs to
    :param pool: name of the pool to manage
    :param scheduler: scheduler type for the pool (e.g. 'mesos')
    :param fetch_state: if True, immediately load current state via reload_state()
    """
    self.cluster = cluster
    self.pool = pool
    self.scheduler = scheduler
    self.cluster_connector = ClusterConnector.load(self.cluster, self.pool, self.scheduler)
    self.pool_config = staticconf.NamespaceReaders(
        POOL_NAMESPACE.format(pool=self.pool, scheduler=self.scheduler),
    )

    # Draining is opt-in per pool; only construct a client when it's enabled.
    self.draining_enabled = self.pool_config.read_bool('draining_enabled', default=False)
    if self.draining_enabled:
        self.draining_client: Optional[DrainingClient] = DrainingClient(cluster)
    else:
        self.draining_client = None

    read_limit = self.pool_config.read_int
    self.min_capacity = read_limit('scaling_limits.min_capacity')
    self.max_capacity = read_limit('scaling_limits.max_capacity')
    # max_tasks_to_kill may be configured as 'inf', hence the special reader
    self.max_tasks_to_kill = read_int_or_inf(self.pool_config, 'scaling_limits.max_tasks_to_kill')
    self.max_weight_to_add = read_limit('scaling_limits.max_weight_to_add')
    self.max_weight_to_remove = read_limit('scaling_limits.max_weight_to_remove')

    if fetch_state:
        self.reload_state()
def process_warning_queue(self) -> None:
    """Process at most one spot-termination warning from the warning queue.

    Pops the next warned host (if any); if its group id belongs to one of this
    cluster's spot fleet resource groups, the host is submitted for draining,
    otherwise the warning message is deleted and ignored.
    """
    host_to_process = self.get_warned_host()
    if not host_to_process:
        return

    logger.info(f'Processing spot warning for {host_to_process.hostname}')

    # Collect the ids of every spot fleet resource group across all Mesos pools
    # in this cluster; a set gives O(1) membership checks below instead of the
    # O(n) scan a list would incur.
    spot_fleet_resource_groups = set()
    for pool in get_pool_name_list(self.cluster, 'mesos'):  # draining only supported for Mesos clusters
        pool_config = staticconf.NamespaceReaders(POOL_NAMESPACE.format(pool=pool, scheduler='mesos'))
        for resource_group_conf in pool_config.read_list('resource_groups'):
            spot_fleet_resource_groups.update(
                SpotFleetResourceGroup.load(
                    cluster=self.cluster,
                    pool=pool,
                    config=list(resource_group_conf.values())[0],
                ).keys()
            )

    # we should definitely ignore termination warnings that aren't from this
    # cluster or maybe not even paasta instances...
    if host_to_process.group_id in spot_fleet_resource_groups:
        logger.info(f'Sending spot warned host to drain: {host_to_process.hostname}')
        self.submit_host_for_draining(host_to_process)
    else:
        logger.info(f'Ignoring spot warned host because not in our SFRs: {host_to_process.hostname}')
        self.delete_warning_messages([host_to_process])
def mock_autoscaler():
    """Build an Autoscaler with all external collaborators patched out for tests."""
    autoscaling_config_dict = {
        'default_signal_role': 'clusterman',
        'setpoint': 0.7,
        'target_capacity_margin': 0.1,
    }
    bar_namespace = POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
    with mock.patch(
        'clusterman.autoscaler.autoscaler.ClustermanMetricsBotoClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.Autoscaler._get_signal_for_app',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.get_monitoring_client',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.ExternalSignal',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PendingPodsSignal',
    ), staticconf.testing.PatchConfiguration(
        {'autoscaling': autoscaling_config_dict},
    ):
        mock_autoscaler = Autoscaler('mesos-test', 'bar', 'mesos', ['bar'], monitoring_enabled=False)

        # Wire up a fake pool manager whose capacity limits mirror the 'bar' pool config.
        mock_autoscaler.pool_manager.cluster_connector = mock.Mock()
        mock_autoscaler.pool_manager.target_capacity = 300
        mock_autoscaler.pool_manager.min_capacity = staticconf.read_int(
            'scaling_limits.min_capacity', namespace=bar_namespace)
        mock_autoscaler.pool_manager.max_capacity = staticconf.read_int(
            'scaling_limits.max_capacity', namespace=bar_namespace)
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0

        # Replace all metric gauges with mocks so tests can assert on .set() calls.
        mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
        mock_autoscaler.non_orphan_capacity_gauge = mock.Mock(spec=GaugeProtocol)
        mock_autoscaler.resource_request_gauges = {
            resource: mock.Mock(spec=GaugeProtocol)
            for resource in ('mem', 'cpus', 'disk', 'gpus')
        }
    return mock_autoscaler
def test_load_cluster_pool_config(cluster, pool, pool_other_config, mock_config_files):
    """Loading a cluster/pool config should populate the pool's staticconf namespace."""
    config.load_cluster_pool_config(cluster, pool, 'mesos', None)
    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler='mesos')
    assert staticconf.read_int('other_config', namespace=pool_namespace) == pool_other_config
    # fixed: 'resource_groups' was an f-string with no placeholders (ruff F541)
    assert staticconf.read_string('resource_groups', namespace=pool_namespace) == cluster
def get_target_capacity_value(target_capacity: str, pool: str, scheduler: str) -> int:
    """Resolve a target-capacity spec into an integer.

    :param target_capacity: 'min', 'max' (case-insensitive), or a numeric string
    :param pool: pool name, used to locate the pool's config namespace
    :param scheduler: scheduler type for the pool
    :returns: the configured min/max capacity, or the literal integer value
    """
    requested = target_capacity.lower()
    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler=scheduler)
    if requested == 'min':
        return staticconf.read_int('scaling_limits.min_capacity', namespace=pool_namespace)
    if requested == 'max':
        return staticconf.read_int('scaling_limits.max_capacity', namespace=pool_namespace)
    return int(requested)
def test_allocation_with_excluded_pods(mock_cluster_connector, daemonset_pod):
    """With exclude_daemonset_pods on, daemonset pods drop out of tracking and allocation."""
    namespace = POOL_NAMESPACE.format(
        pool=mock_cluster_connector.pool,
        scheduler=mock_cluster_connector.SCHEDULER,
    )
    with PatchConfiguration({'exclude_daemonset_pods': True}, namespace=namespace):
        mock_cluster_connector.reload_state()
        assert daemonset_pod not in mock_cluster_connector._pods_by_ip['10.10.10.2']
        assert mock_cluster_connector.get_resource_total('cpus') == 10
        assert mock_cluster_connector.get_resource_allocation('cpus') == 6
def pool_configs():
    """Fixture: patch the 'bar' Mesos pool's scaling limits for the duration of a test."""
    scaling_limits = {
        'min_capacity': 24,
        'max_capacity': 5000,
        'max_weight_to_add': 200,
        'max_weight_to_remove': 10,
    }
    namespace = POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
    with staticconf.testing.PatchConfiguration(
        {'scaling_limits': scaling_limits},
        namespace=namespace,
    ):
        yield
def mock_config_namespaces():
    # To avoid polluting staticconf for other tests, and clear out stuff from conftest that mocks configuration
    default_namespace_config = {
        'clusters': {
            'cluster-A': {
                'mesos_url_api': 'service.leader',
                'aws_region': 'us-test-3',
            },
        },
        'aws': {
            'access_key_file': '/etc/no_cfg/clusterman.json',
        },
    }
    with staticconf.testing.MockConfiguration(
        {},
        namespace=POOL_NAMESPACE.format(pool='pool-1', scheduler='mesos'),
    ), staticconf.testing.MockConfiguration(
        {},
        namespace=POOL_NAMESPACE.format(pool='pool-2', scheduler='mesos'),
    ), staticconf.testing.MockConfiguration(
        default_namespace_config,
        namespace=staticconf.config.DEFAULT,
    ):
        yield
def _get_signal_for_app(self, app: str) -> Signal:
    """Load the autoscaling signal object for a particular app.

    :param app: the name of the app to load a Signal for
    :returns: the app's configured signal, or the default signal if the app has
        none configured or loading it fails for any reason
    """
    logger.info(f'Loading autoscaling signal for {app} on {self.pool} in {self.cluster}')

    # TODO (CLUSTERMAN-126, CLUSTERMAN-195) apps will eventually have separate namespaces from pools
    pool_namespace = POOL_NAMESPACE.format(pool=app, scheduler=self.scheduler)
    signal_namespace = staticconf.read_string(
        'autoscale_signal.namespace', default=app, namespace=pool_namespace)

    try:
        # see if the pool has set up a custom signal correctly; if not, fall back to the default signal
        return Signal(
            self.cluster,
            self.pool,
            self.scheduler,
            app,
            pool_namespace,
            self.metrics_client,
            signal_namespace,
        )
    except NoSignalConfiguredException:
        logger.info(f'No signal configured for {app}, falling back to default')
    except Exception:
        # Unexpected failure: warn via sensu (non-paging) so signal owners notice,
        # but keep autoscaling running on the default signal.
        msg = f'WARNING: loading signal for {app} failed, falling back to default'
        logger.exception(msg)
        sensu_checkin(
            check_name=SIGNAL_LOAD_CHECK_NAME,
            status=Status.WARNING,
            output=msg,
            source=self.cluster,
            scheduler=self.scheduler,
            page=False,
            ttl=None,
            app=app,
            noop=not self.monitoring_enabled,
            pool=self.pool,
        )
    return self.default_signal
def setup_signals_environment(pool: str, scheduler: str) -> Tuple[int, int]:
    """Export signal-fetching metadata to the process environment.

    Builds parallel lists describing the default signal plus (optionally) the
    pool's own signal, then publishes them as CMAN_* environment variables.

    :returns: (number of distinct signal versions to fetch, number of signals)
    """
    app_namespace = POOL_NAMESPACE.format(pool=pool, scheduler=scheduler)

    # Seed every list with the default signal's metadata.
    default_signal_version = staticconf.read_string('autoscale_signal.branch_or_tag')
    signal_versions = [default_signal_version]
    signal_namespaces = [staticconf.read_string('autoscaling.default_signal_role')]
    signal_names = [staticconf.read_string('autoscale_signal.name')]
    app_names = ['__default__']

    # If the pool configures its own signal, append its metadata as well.
    app_signal_name = staticconf.read_string(
        'autoscale_signal.name',
        namespace=app_namespace,
        default=None,
    )
    if app_signal_name:
        signal_names.append(app_signal_name)
        signal_versions.append(staticconf.read_string(
            'autoscale_signal.branch_or_tag',
            namespace=app_namespace,
            default=pool,
        ))
        signal_namespaces.append(staticconf.read_string(
            'autoscale_signal.namespace',
            namespace=app_namespace,
            default=pool,
        ))
        app_names.append(pool)

    # De-duplicate versions so each branch/tag is only fetched once.
    versions_to_fetch = set(signal_versions)
    os.environ.update({
        'CMAN_VERSIONS_TO_FETCH': ' '.join(versions_to_fetch),
        'CMAN_SIGNAL_VERSIONS': ' '.join(signal_versions),
        'CMAN_SIGNAL_NAMESPACES': ' '.join(signal_namespaces),
        'CMAN_SIGNAL_NAMES': ' '.join(signal_names),
        'CMAN_SIGNAL_APPS': ' '.join(app_names),
        'CMAN_NUM_VERSIONS': str(len(versions_to_fetch)),
        'CMAN_NUM_SIGNALS': str(len(signal_versions)),
        'CMAN_SIGNALS_BUCKET': staticconf.read_string(
            'aws.signals_bucket', default=DEFAULT_SIGNALS_BUCKET),
    })

    return len(versions_to_fetch), len(signal_versions)
def __init__(self, cluster: str, pool: str) -> None:
    """Bind this connector to a cluster/pool and attach the pool's config readers."""
    self.cluster = cluster
    self.pool = pool
    namespace = POOL_NAMESPACE.format(pool=self.pool, scheduler=self.SCHEDULER)
    self.pool_config = staticconf.NamespaceReaders(namespace)
def __init__(
    self,
    cluster: str,
    pool: str,
    scheduler: str,
    apps: List[str],
    pool_manager: Optional[PoolManager] = None,
    metrics_client: Optional[ClustermanMetricsBotoClient] = None,
    monitoring_enabled: bool = True,
) -> None:
    """ Class containing the core logic for autoscaling a cluster

    :param cluster: the name of the cluster to autoscale
    :param pool: the name of the pool to autoscale
    :param scheduler: the scheduler type for the pool
    :param apps: a list of apps running on the pool (currently at most one is supported)
    :param pool_manager: a PoolManager object (used for simulations)
    :param metrics_client: a ClustermanMetricsBotoClient object (used for simulations)
    :param monitoring_enabled: set to False to disable sensu alerts during scaling
    """
    self.cluster = cluster
    self.pool = pool
    self.scheduler = scheduler
    self.apps = apps
    self.monitoring_enabled = monitoring_enabled

    # TODO: handle multiple apps in the autoscaler (CLUSTERMAN-126)
    if len(self.apps) > 1:
        raise NotImplementedError(
            'Scaling multiple apps in a cluster is not yet supported')

    logger.info(
        f'Initializing autoscaler engine for {self.pool} in {self.cluster}...'
    )

    # Gauges report target capacity and per-resource requests, tagged by cluster/pool.
    gauge_dimensions = {'cluster': cluster, 'pool': pool}
    monitoring_client = get_monitoring_client()
    self.target_capacity_gauge = monitoring_client.create_gauge(
        TARGET_CAPACITY_GAUGE_NAME, gauge_dimensions)
    # NOTE(review): only cpus/mem/disk gauges are created here, but the test fixture
    # elsewhere in this file also mocks a 'gpus' gauge — confirm whether 'gpus'
    # should be included in this tuple.
    self.resource_request_gauges: Dict[str, Any] = {}
    for resource in ('cpus', 'mem', 'disk'):
        self.resource_request_gauges[
            resource] = monitoring_client.create_gauge(
                RESOURCE_GAUGE_BASE_NAME.format(resource=resource),
                gauge_dimensions,
            )

    self.autoscaling_config = get_autoscaling_config(
        POOL_NAMESPACE.format(pool=self.pool, scheduler=self.scheduler),
    )
    # pool_manager/metrics_client may be injected (e.g. by the simulator);
    # otherwise build real ones for this cluster/pool.
    self.pool_manager = pool_manager or PoolManager(
        self.cluster, self.pool, self.scheduler)
    self.mesos_region = staticconf.read_string('aws.region')
    self.metrics_client = metrics_client or ClustermanMetricsBotoClient(
        self.mesos_region)

    # The default signal is the fallback used when an app's own signal is
    # missing or fails to load (see _get_signal_for_app).
    self.default_signal = Signal(
        self.cluster,
        self.pool,
        self.scheduler,
        '__default__',
        DEFAULT_NAMESPACE,
        self.metrics_client,
        signal_namespace=staticconf.read_string(
            'autoscaling.default_signal_role'),
    )
    self.signal = self._get_signal_for_app(self.apps[0])
    logger.info('Initialization complete')
def sensu_checkin(
    *,
    check_name: str,
    output: str,
    source: str,
    status: Status = Status.OK,
    app: Optional[str] = None,
    pool: Optional[str] = None,
    scheduler: Optional[str] = None,
    noop: bool = False,
    page: bool = True,
    **kwargs: Any,
) -> None:
    """Send a check-in event to Sensu, resolving the alerting config from srv-configs.

    :param check_name: name of the Sensu check
    :param output: human-readable event body (provenance details are appended)
    :param source: cluster/region the event originates from
    :param status: Sensu status to report (defaults to OK)
    :param app: app name, used to pick the app/pool config namespace
    :param pool: pool name, included in the event output
    :param scheduler: scheduler type, used with `app` to build the namespace
    :param noop: if True (or pysensu is unavailable), log instead of sending
    :param page: whether the event may page; config can force this off
    :param kwargs: extra fields merged into the event, overriding config values
    """
    # This function feels like a massive hack, let's revisit and see if we can make it better (CLUSTERMAN-304)
    #
    # TODO (CLUSTERMAN-126) right now there's only one app per pool so use the global pool namespace
    # We assume the "pool" name and the "app" name are the same
    #
    # Use 'no-namespace' instead of None so we don't skip the per-cluster override
    pool_namespace = POOL_NAMESPACE.format(
        pool=app, scheduler=scheduler) if app else 'no-namespace'

    # read the sensu configuration from srv-configs; signals are not required to define this, so in the case
    # that they do not define anything, we fall back to the clusterman config. The clusterman config can override
    # alerts on a per-cluster basis, so first check there; if nothing is defined there, fall back to the default,
    # which is required to be defined, so we know that someone is going to get the notification
    #
    sensu_config = dict(
        staticconf.read_list('sensu_config', default=[{}],
                             namespace=pool_namespace).pop())
    if not sensu_config:
        sensu_config = dict(
            staticconf.read_list(f'clusters.{source}.sensu_config',
                                 default=[{}]).pop())
    if not sensu_config:
        sensu_config = dict(staticconf.read_list('sensu_config').pop())

    # If we've turned off paging in the config, we don't want this function to ever page
    config_page = sensu_config.pop('page', None)
    page = False if config_page is False else page

    # So we know where alerts are coming from precisely
    output += ''.join([
        '\n\nThis check came from:\n',
        f'- Cluster/region: {source}\n',
        f'- Pool: {pool}.{scheduler}\n' if pool else '',
        f'- App: {app}\n' if app else '',
    ])

    sensu_config.update({
        'name': check_name,
        'output': output,
        'source': source,
        'status': status.value,
        'page': page,
    })
    # values passed in to this function override config file values (is this really correct??)
    sensu_config.update(kwargs)

    pysensu_yelp = _get_sensu()
    if noop or not pysensu_yelp:
        logger.info(('Would have sent this event to Sensu:\n'
                     f'{pprint.pformat(sensu_config)}'))
        return

    # team and runbook are required entries in srv-configs, so we know this will go to the "right" place
    pysensu_yelp.send_event(**sensu_config)