Example #1
    def __init__(
        self,
        cluster: str,
        pool: str,
        scheduler: str,
        fetch_state: bool = True,
    ) -> None:
        self.cluster = cluster
        self.pool = pool
        self.scheduler = scheduler
        self.cluster_connector = ClusterConnector.load(self.cluster, self.pool,
                                                       self.scheduler)
        self.pool_config = staticconf.NamespaceReaders(
            POOL_NAMESPACE.format(pool=self.pool, scheduler=self.scheduler))

        self.draining_enabled = self.pool_config.read_bool('draining_enabled',
                                                           default=False)
        self.draining_client: Optional[DrainingClient] = (
            DrainingClient(cluster) if self.draining_enabled else None
        )
        self.min_capacity = self.pool_config.read_int(
            'scaling_limits.min_capacity')
        self.max_capacity = self.pool_config.read_int(
            'scaling_limits.max_capacity')
        self.max_tasks_to_kill = read_int_or_inf(
            self.pool_config, 'scaling_limits.max_tasks_to_kill')
        self.max_weight_to_add = self.pool_config.read_int(
            'scaling_limits.max_weight_to_add')
        self.max_weight_to_remove = self.pool_config.read_int(
            'scaling_limits.max_weight_to_remove')

        if fetch_state:
            self.reload_state()
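
Every example in this listing keys its configuration reads off POOL_NAMESPACE, a format string naming a per-pool staticconf namespace. A minimal sketch of the pattern, using an assumed template value (the real constant is defined in clusterman's config module):

import staticconf

POOL_NAMESPACE = '{pool}.{scheduler}_config'  # assumed template, for illustration only

reader = staticconf.NamespaceReaders(POOL_NAMESPACE.format(pool='bar', scheduler='mesos'))
min_capacity = reader.read_int('scaling_limits.min_capacity')  # resolved within the pool namespace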
Example #2
    def process_warning_queue(self) -> None:
        host_to_process = self.get_warned_host()
        if host_to_process:
            logger.info(
                f'Processing spot warning for {host_to_process.hostname}')
            spot_fleet_resource_groups = []
            # draining only supported for Mesos clusters
            for pool in get_pool_name_list(self.cluster, 'mesos'):
                pool_config = staticconf.NamespaceReaders(
                    POOL_NAMESPACE.format(pool=pool, scheduler='mesos'))
                for resource_group_conf in pool_config.read_list('resource_groups'):
                    spot_fleet_resource_groups.extend(
                        SpotFleetResourceGroup.load(
                            cluster=self.cluster,
                            pool=pool,
                            config=list(resource_group_conf.values())[0],
                        ).keys()
                    )

            # Ignore termination warnings that aren't from this cluster; they may
            # not even be PaaSTA instances.
            if host_to_process.group_id in spot_fleet_resource_groups:
                logger.info(
                    f'Sending spot warned host to drain: {host_to_process.hostname}'
                )
                self.submit_host_for_draining(host_to_process)
            else:
                logger.info(
                    f'Ignoring spot warned host because not in our SFRs: {host_to_process.hostname}'
                )
            self.delete_warning_messages([host_to_process])
Example #3
def mock_autoscaler():
    autoscaling_config_dict = {
        'default_signal_role': 'clusterman',
        'setpoint': 0.7,
        'target_capacity_margin': 0.1,
    }

    with mock.patch(
        'clusterman.autoscaler.autoscaler.ClustermanMetricsBotoClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.Autoscaler._get_signal_for_app',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.get_monitoring_client',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.ExternalSignal',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PendingPodsSignal',
    ), staticconf.testing.PatchConfiguration(
        {'autoscaling': autoscaling_config_dict},
    ):
        mock_autoscaler = Autoscaler('mesos-test', 'bar', 'mesos', ['bar'], monitoring_enabled=False)
        mock_autoscaler.pool_manager.cluster_connector = mock.Mock()

    mock_autoscaler.pool_manager.target_capacity = 300
    mock_autoscaler.pool_manager.min_capacity = staticconf.read_int(
        'scaling_limits.min_capacity', namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
    )
    mock_autoscaler.pool_manager.max_capacity = staticconf.read_int(
        'scaling_limits.max_capacity', namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
    )
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0

    mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
    mock_autoscaler.non_orphan_capacity_gauge = mock.Mock(spec=GaugeProtocol)
    mock_autoscaler.resource_request_gauges = {
        'mem': mock.Mock(spec=GaugeProtocol),
        'cpus': mock.Mock(spec=GaugeProtocol),
        'disk': mock.Mock(spec=GaugeProtocol),
        'gpus': mock.Mock(spec=GaugeProtocol),
    }
    return mock_autoscaler
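
This looks like a pytest fixture whose @pytest.fixture decorator was stripped in extraction. Assuming it is registered as one, a hypothetical test could consume it like so:

def test_capacity_bounds(mock_autoscaler):
    pm = mock_autoscaler.pool_manager
    # target_capacity is pinned to 300 above; min/max come from the pool namespace
    assert pm.min_capacity <= pm.target_capacity <= pm.max_capacity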
Example #4
def test_load_cluster_pool_config(cluster, pool, pool_other_config,
                                  mock_config_files):
    config.load_cluster_pool_config(cluster, pool, 'mesos', None)

    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler='mesos')
    assert staticconf.read_int('other_config',
                               namespace=pool_namespace) == pool_other_config
    assert staticconf.read_string('resource_groups',
                                  namespace=pool_namespace) == cluster
Example #5
def get_target_capacity_value(target_capacity: str, pool: str, scheduler: str) -> int:
    target_capacity = target_capacity.lower()
    pool_namespace = POOL_NAMESPACE.format(pool=pool, scheduler=scheduler)
    if target_capacity == 'min':
        return staticconf.read_int('scaling_limits.min_capacity', namespace=pool_namespace)
    elif target_capacity == 'max':
        return staticconf.read_int('scaling_limits.max_capacity', namespace=pool_namespace)
    else:
        return int(target_capacity)
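
Hypothetical calls illustrating the three branches ('min' and 'max' resolve against the pool's scaling limits; anything else must parse as an integer, otherwise int() raises ValueError):

get_target_capacity_value('MIN', 'bar', 'mesos')  # scaling_limits.min_capacity for the pool
get_target_capacity_value('max', 'bar', 'mesos')  # scaling_limits.max_capacity for the pool
get_target_capacity_value('150', 'bar', 'mesos')  # 150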
Example #6
def test_allocation_with_excluded_pods(mock_cluster_connector, daemonset_pod):
    with PatchConfiguration({'exclude_daemonset_pods': True},
                            namespace=POOL_NAMESPACE.format(
                                pool=mock_cluster_connector.pool,
                                scheduler=mock_cluster_connector.SCHEDULER)):
        mock_cluster_connector.reload_state()
        assert daemonset_pod not in mock_cluster_connector._pods_by_ip['10.10.10.2']
        assert mock_cluster_connector.get_resource_total('cpus') == 10
        assert mock_cluster_connector.get_resource_allocation('cpus') == 6
Example #7
def pool_configs():
    with staticconf.testing.PatchConfiguration(
        {
            'scaling_limits': {
                'min_capacity': 24,
                'max_capacity': 5000,
                'max_weight_to_add': 200,
                'max_weight_to_remove': 10,
            },
        },
        namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos'),
    ):
        yield
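
While this fixture is active, reads against the 'bar'/'mesos' pool namespace see the patched limits. A hypothetical test combining it with get_target_capacity_value from Example #5:

def test_target_capacity_limits(pool_configs):
    assert get_target_capacity_value('min', 'bar', 'mesos') == 24
    assert get_target_capacity_value('max', 'bar', 'mesos') == 5000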
Example #8
def mock_config_namespaces():
    # Avoid polluting staticconf for other tests, and clear out configuration mocks set up in conftest
    with staticconf.testing.MockConfiguration(
        {},
        namespace=POOL_NAMESPACE.format(pool='pool-1', scheduler='mesos'),
    ), staticconf.testing.MockConfiguration(
        {},
        namespace=POOL_NAMESPACE.format(pool='pool-2', scheduler='mesos'),
    ), staticconf.testing.MockConfiguration(
        {
            'clusters': {
                'cluster-A': {
                    'mesos_url_api': 'service.leader',
                    'aws_region': 'us-test-3',
                },
            },
            'aws': {
                'access_key_file': '/etc/no_cfg/clusterman.json',
            }
        },
        namespace=staticconf.config.DEFAULT,
    ):
        yield
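
Under this fixture the default namespace contains only the mocked keys, and both pool namespaces are empty. A hypothetical test reading the mocked values back:

def test_mocked_defaults(mock_config_namespaces):
    assert staticconf.read_string('clusters.cluster-A.aws_region') == 'us-test-3'
    assert staticconf.read_string('aws.access_key_file') == '/etc/no_cfg/clusterman.json'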
Example #9
    def _get_signal_for_app(self, app: str) -> Signal:
        """Load the signal object to use for autoscaling for a particular app

        :param app: the name of the app to load a Signal for
        :returns: the configured app signal, or the default signal in case of an error
        """
        logger.info(
            f'Loading autoscaling signal for {app} on {self.pool} in {self.cluster}'
        )

        # TODO (CLUSTERMAN-126, CLUSTERMAN-195) apps will eventually have separate namespaces from pools
        pool_namespace = POOL_NAMESPACE.format(pool=app,
                                               scheduler=self.scheduler)
        signal_namespace = staticconf.read_string('autoscale_signal.namespace',
                                                  default=app,
                                                  namespace=pool_namespace)

        try:
            # see if the pool has set up a custom signal correctly; if not, fall back to the default signal
            return Signal(
                self.cluster,
                self.pool,
                self.scheduler,
                app,
                pool_namespace,
                self.metrics_client,
                signal_namespace,
            )
        except NoSignalConfiguredException:
            logger.info(
                f'No signal configured for {app}, falling back to default')
            return self.default_signal
        except Exception:
            msg = f'WARNING: loading signal for {app} failed, falling back to default'
            logger.exception(msg)
            sensu_checkin(
                check_name=SIGNAL_LOAD_CHECK_NAME,
                status=Status.WARNING,
                output=msg,
                source=self.cluster,
                scheduler=self.scheduler,
                page=False,
                ttl=None,
                app=app,
                noop=not self.monitoring_enabled,
                pool=self.pool,
            )
            return self.default_signal
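
The 'autoscale_signal.namespace' key in the pool's namespace decides where the custom signal is loaded from. A hedged sketch of steering it in a test (the autoscaler instance and the namespace value are hypothetical):

with staticconf.testing.PatchConfiguration(
    {'autoscale_signal': {'namespace': 'my_signal_namespace'}},
    namespace=POOL_NAMESPACE.format(pool='my_app', scheduler='mesos'),
):
    signal = autoscaler._get_signal_for_app('my_app')  # tries 'my_signal_namespace' first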
Example #10
def setup_signals_environment(pool: str, scheduler: str) -> Tuple[int, int]:
    app_namespace = POOL_NAMESPACE.format(pool=pool, scheduler=scheduler)
    default_signal_version = staticconf.read_string(
        'autoscale_signal.branch_or_tag')
    signal_versions = [default_signal_version]
    signal_namespaces = [
        staticconf.read_string('autoscaling.default_signal_role')
    ]
    signal_names = [staticconf.read_string('autoscale_signal.name')]
    app_names = ['__default__']

    app_signal_name = staticconf.read_string(
        'autoscale_signal.name',
        namespace=app_namespace,
        default=None,
    )
    if app_signal_name:
        signal_names.append(app_signal_name)
        signal_versions.append(
            staticconf.read_string(
                'autoscale_signal.branch_or_tag',
                namespace=app_namespace,
                default=pool,
            ))
        signal_namespaces.append(
            staticconf.read_string('autoscale_signal.namespace',
                                   namespace=app_namespace,
                                   default=pool))
        app_names.append(pool)

    versions_to_fetch = set(signal_versions)
    os.environ['CMAN_VERSIONS_TO_FETCH'] = ' '.join(versions_to_fetch)
    os.environ['CMAN_SIGNAL_VERSIONS'] = ' '.join(signal_versions)
    os.environ['CMAN_SIGNAL_NAMESPACES'] = ' '.join(signal_namespaces)
    os.environ['CMAN_SIGNAL_NAMES'] = ' '.join(signal_names)
    os.environ['CMAN_SIGNAL_APPS'] = ' '.join(app_names)
    os.environ['CMAN_NUM_VERSIONS'] = str(len(versions_to_fetch))
    os.environ['CMAN_NUM_SIGNALS'] = str(len(signal_versions))
    os.environ['CMAN_SIGNALS_BUCKET'] = staticconf.read_string(
        'aws.signals_bucket', default=DEFAULT_SIGNALS_BUCKET)

    return len(versions_to_fetch), len(signal_versions)
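
The return value and the CMAN_* environment variables together form this function's entire interface. A sketch of the consuming side (the consumer itself is hypothetical; the variable formats come from the writes above):

num_versions, num_signals = setup_signals_environment('bar', 'mesos')
versions = os.environ['CMAN_SIGNAL_VERSIONS'].split(' ')
names = os.environ['CMAN_SIGNAL_NAMES'].split(' ')
assert len(versions) == num_signals == len(names)
assert len(set(versions)) == num_versions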
Example #11
    def __init__(self, cluster: str, pool: str) -> None:
        self.cluster = cluster
        self.pool = pool
        self.pool_config = staticconf.NamespaceReaders(
            POOL_NAMESPACE.format(pool=self.pool, scheduler=self.SCHEDULER))
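
SCHEDULER here is a class attribute, presumably pinned by each connector subclass; a minimal hypothetical sketch:

class KubernetesClusterConnector(ClusterConnector):
    SCHEDULER = 'kubernetes'  # each connector subclass fixes its scheduler type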
Example #12
    def __init__(
        self,
        cluster: str,
        pool: str,
        scheduler: str,
        apps: List[str],
        pool_manager: Optional[PoolManager] = None,
        metrics_client: Optional[ClustermanMetricsBotoClient] = None,
        monitoring_enabled: bool = True,
    ) -> None:
        """ Class containing the core logic for autoscaling a cluster

        :param cluster: the name of the cluster to autoscale
        :param pool: the name of the pool to autoscale
        :param scheduler: the scheduler backing the pool (e.g. 'mesos')
        :param apps: a list of apps running on the pool
        :param pool_manager: a PoolManager object (used for simulations)
        :param metrics_client: a ClustermanMetricsBotoClient object (used for simulations)
        :param monitoring_enabled: set to False to disable sensu alerts during scaling
        """
        self.cluster = cluster
        self.pool = pool
        self.scheduler = scheduler
        self.apps = apps
        self.monitoring_enabled = monitoring_enabled

        # TODO: handle multiple apps in the autoscaler (CLUSTERMAN-126)
        if len(self.apps) > 1:
            raise NotImplementedError(
                'Scaling multiple apps in a cluster is not yet supported')

        logger.info(
            f'Initializing autoscaler engine for {self.pool} in {self.cluster}...'
        )

        gauge_dimensions = {'cluster': cluster, 'pool': pool}
        monitoring_client = get_monitoring_client()
        self.target_capacity_gauge = monitoring_client.create_gauge(
            TARGET_CAPACITY_GAUGE_NAME, gauge_dimensions)
        self.resource_request_gauges: Dict[str, Any] = {}
        for resource in ('cpus', 'mem', 'disk'):
            self.resource_request_gauges[resource] = monitoring_client.create_gauge(
                RESOURCE_GAUGE_BASE_NAME.format(resource=resource),
                gauge_dimensions,
            )

        self.autoscaling_config = get_autoscaling_config(
            POOL_NAMESPACE.format(pool=self.pool, scheduler=self.scheduler),
        )
        self.pool_manager = pool_manager or PoolManager(
            self.cluster, self.pool, self.scheduler)

        self.mesos_region = staticconf.read_string('aws.region')
        self.metrics_client = metrics_client or ClustermanMetricsBotoClient(
            self.mesos_region)
        self.default_signal = Signal(
            self.cluster,
            self.pool,
            self.scheduler,
            '__default__',
            DEFAULT_NAMESPACE,
            self.metrics_client,
            signal_namespace=staticconf.read_string(
                'autoscaling.default_signal_role'),
        )
        self.signal = self._get_signal_for_app(self.apps[0])
        logger.info('Initialization complete')
Example #13
def sensu_checkin(
    *,
    check_name: str,
    output: str,
    source: str,
    status: Status = Status.OK,
    app: Optional[str] = None,
    pool: Optional[str] = None,
    scheduler: Optional[str] = None,
    noop: bool = False,
    page: bool = True,
    **kwargs: Any,
) -> None:
    # This function feels like a massive hack, let's revisit and see if we can make it better (CLUSTERMAN-304)
    #
    # TODO (CLUSTERMAN-126) right now there's only one app per pool so use the global pool namespace
    # We assume the "pool" name and the "app" name are the same
    #
    # Use 'no-namespace' instead of None so we don't skip the per-cluster override
    pool_namespace = (
        POOL_NAMESPACE.format(pool=app, scheduler=scheduler) if app else 'no-namespace'
    )

    # read the sensu configuration from srv-configs; signals are not required to define this, so in the case
    # that they do not define anything, we fall back to the clusterman config.  The clusterman config can override
    # alerts on a per-cluster basis, so first check there; if nothing is defined there, fall back to the default,
    # which is required to be defined, so we know that someone is going to get the notification
    #
    sensu_config = dict(
        staticconf.read_list('sensu_config',
                             default=[{}],
                             namespace=pool_namespace).pop())
    if not sensu_config:
        sensu_config = dict(
            staticconf.read_list(f'clusters.{source}.sensu_config',
                                 default=[{}]).pop())
    if not sensu_config:
        sensu_config = dict(staticconf.read_list('sensu_config').pop())

    # If we've turned off paging in the config, we don't want this function to ever page
    config_page = sensu_config.pop('page', None)
    page = False if config_page is False else page

    # So we know where alerts are coming from precisely
    output += ''.join([
        '\n\nThis check came from:\n',
        f'- Cluster/region: {source}\n',
        f'- Pool: {pool}.{scheduler}\n' if pool else '',
        f'- App: {app}\n' if app else '',
    ])

    sensu_config.update({
        'name': check_name,
        'output': output,
        'source': source,
        'status': status.value,
        'page': page,
    })
    # values passed in to this function override config file values (is this really correct??)
    sensu_config.update(kwargs)

    pysensu_yelp = _get_sensu()
    if noop or not pysensu_yelp:
        logger.info(('Would have sent this event to Sensu:\n'
                     f'{pprint.pformat(sensu_config)}'))
        return

    # team and runbook are required entries in srv-configs, so we know this will go to the "right" place
    pysensu_yelp.send_event(**sensu_config)
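
A hypothetical invocation, assuming a sensu_config list is defined somewhere in the fallback chain (the final read is required and raises if absent); with noop=True the event is pretty-printed to the log instead of being sent:

sensu_checkin(
    check_name='clusterman_example_check',
    output='everything is fine',
    source='mesos-test',
    app='bar',
    scheduler='mesos',
    noop=True,
)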