Example #1
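A batch configuration hook: wires the parsed options into a PoolManager and an Autoscaler, then clears the config watchers because the autoscaler bootstrap script handles config watching.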
    def configure_initial(self):
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger

        self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing

        pool_manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=pool_manager,
        )

        # We don't want to watch anything here because the autoscaler bootstrap script takes care of that for us
        self.config.watchers.clear()
Example #2
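A standalone batch runner: parses command-line arguments, builds the Autoscaler in configure(), and loops forever, splaying each run in time before autoscaling.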
class AutoscalerBatch(BatchRunningSentinelMixin):
    def parse_args(self):
        parser = argparse.ArgumentParser()
        arg_group = parser.add_argument_group('AutoscalerBatch options')
        add_cluster_arg(arg_group, required=True)
        add_pool_arg(arg_group)
        add_scheduler_arg(arg_group)
        add_cluster_config_directory_arg(arg_group)
        add_env_config_path_arg(arg_group)
        arg_group.add_argument(
            '--dry-run',
            default=False,
            action='store_true',
            help='If true, will only log autoscaling decisions instead of modifying capacities',
        )
        self.options = parser.parse_args()

    def configure(self) -> None:
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger

        self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing

        pool_manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=pool_manager,
        )

    def _autoscale(self) -> None:
        assert self.autoscaler
        time.sleep(splay_event_time(
            self.autoscaler.run_frequency,
            self.__class__.__name__ + self.options.cluster + self.options.pool,
        ))
        with suppress_request_limit_exceeded():
            self.autoscaler.run(dry_run=self.options.dry_run)

    def run(self) -> None:
        self.make_running_sentinel()
        while True:
            self._autoscale()
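splay_event_time staggers wake-ups so that many batches sharing the same run frequency don't all fire at once. A hedged sketch of the idea (not the actual clusterman implementation; seeding random with the key string is an assumption):

import random
import time

def splay_event_time(frequency: int, key: str) -> float:
    # derive a stable, per-key offset in [0, frequency), then return the
    # time remaining until the next offset-aligned tick
    random.seed(key)
    offset = random.random() * frequency
    return frequency - (time.time() - offset) % frequency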
Example #3
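A test helper that builds an Autoscaler with all of its collaborators patched out, then swaps in mocks for the pool manager attributes and the monitoring gauges.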
def mock_autoscaler():
    autoscaling_config_dict = {
        'default_signal_role': 'clusterman',
        'setpoint': 0.7,
        'target_capacity_margin': 0.1,
    }

    with mock.patch(
        'clusterman.autoscaler.autoscaler.ClustermanMetricsBotoClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.Autoscaler._get_signal_for_app',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.get_monitoring_client',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.ExternalSignal',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PendingPodsSignal',
    ), staticconf.testing.PatchConfiguration(
        {'autoscaling': autoscaling_config_dict},
    ):
        mock_autoscaler = Autoscaler('mesos-test', 'bar', 'mesos', ['bar'], monitoring_enabled=False)
        mock_autoscaler.pool_manager.cluster_connector = mock.Mock()

    mock_autoscaler.pool_manager.target_capacity = 300
    mock_autoscaler.pool_manager.min_capacity = staticconf.read_int(
        'scaling_limits.min_capacity', namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
    )
    mock_autoscaler.pool_manager.max_capacity = staticconf.read_int(
        'scaling_limits.max_capacity', namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos')
    )
    mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0

    mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
    mock_autoscaler.non_orphan_capacity_gauge = mock.Mock(spec=GaugeProtocol)
    mock_autoscaler.resource_request_gauges = {
        'mem': mock.Mock(spec=GaugeProtocol),
        'cpus': mock.Mock(spec=GaugeProtocol),
        'disk': mock.Mock(spec=GaugeProtocol),
        'gpus': mock.Mock(spec=GaugeProtocol),
    }
    return mock_autoscaler
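A hedged usage sketch, assuming the helper above is registered as a pytest fixture (the test itself is hypothetical):

def test_capacity_limits(mock_autoscaler):
    # these values are set directly by the fixture above
    assert mock_autoscaler.pool_manager.target_capacity == 300
    assert mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity == 0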
Example #4
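A behave fixture step that constructs a Mesos Autoscaler against patched dependencies.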
def autoscaler(context):
    behave.use_fixture(autoscaler_patches, context)
    context.autoscaler = Autoscaler(
        cluster='mesos-test',
        pool='bar',
        apps=['bar'],
        scheduler='mesos',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )
Example #5
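The configure hook in isolation, identical in shape to the configure hooks in Examples #1 and #2.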
    def configure(self) -> None:
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger

        self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing

        pool_manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=pool_manager,
        )
Example #6
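A behave step for the Mesos scheduler; when the scenario provides allocated_cpus, the metrics client is stubbed with mocked scaling metrics.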
def mesos_autoscaler(context):
    behave.use_fixture(autoscaler_patches, context)
    context.autoscaler = Autoscaler(
        cluster='mesos-test',
        pool='bar',
        apps=['bar'],
        scheduler='mesos',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )
    if hasattr(context, 'allocated_cpus'):
        # stub the metrics client after construction so the side_effect lands
        # on the mock that the new Autoscaler actually holds
        context.autoscaler.metrics_client.get_metric_values.side_effect = make_mock_scaling_metrics(
            context.allocated_cpus,
            context.boost,
        )
Example #7
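A unit test asserting that the Autoscaler refuses to manage more than one app.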
def test_autoscaler_init_too_many_apps():
    with pytest.raises(NotImplementedError):
        Autoscaler(
            'mesos-test',
            'bar',
            'mesos',
            ['app1', 'app2'],
            monitoring_enabled=False,
        )
Example #8
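The full batch daemon: argument parsing, configuration, a per-cluster scribe log stream, the splayed autoscaling loop with connection-error handling, and Sensu check-ins.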
class AutoscalerBatch(BatchDaemon, BatchLoggingMixin, BatchRunningSentinelMixin):
    notify_emails = ['*****@*****.**']

    @batch_command_line_arguments
    def parse_args(self, parser):
        arg_group = parser.add_argument_group('AutoscalerBatch options')
        add_cluster_arg(arg_group, required=True)
        add_pool_arg(arg_group)
        add_scheduler_arg(arg_group)
        add_cluster_config_directory_arg(arg_group)
        add_env_config_path_arg(arg_group)
        arg_group.add_argument(
            '--dry-run',
            default=False,
            action='store_true',
            help='If true, will only log autoscaling decisions instead of modifying capacities',
        )

    @batch_configure
    def configure_initial(self):
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger

        self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing

        pool_manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=pool_manager,
        )

        # We don't want to watch anything here because the autoscaler bootstrap script takes care of that for us
        self.clear_watchers()

    def _get_local_log_stream(self, clog_prefix=None):
        # Overrides the yelp_batch default, which is tmp_batch_<filename> (autoscaler in this case).
        #
        # This controls the name of the scribe log stream for this batch. Without it, the log would
        # conflict with other batches (like the Kew autoscaler). We create a separate stream for each
        # cluster and (non-default) pool so they are easy to tell apart.
        return get_autoscaler_scribe_stream(self.options.cluster, self.options.pool, self.options.scheduler)

    @sensu_alert_triage()
    def _autoscale(self):
        time.sleep(splay_event_time(
            self.autoscaler.run_frequency,
            self.get_name() + self.options.cluster + self.options.pool,
        ))
        with suppress_request_limit_exceeded():
            self.autoscaler.run(dry_run=self.options.dry_run)

    def run(self):
        # self.running is a property from yelp_batch which checks version_checker if a watcher config has changed.
        # If so, the entire batch restarts and configs for the service are reloaded.
        while self.running:
            try:
                self._autoscale()
            except (PoolConnectionError, EndpointConnectionError) as e:
                logger.exception(f'Encountered a connection error: {e}')

    def _do_sensu_checkins(self, service_failed, msg):
        check_every = (
            '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60))
            if self.autoscaler else DEFAULT_CHECK_EVERY
        )
        # magic-y numbers here; an alert will time out after two autoscaler run periods plus a five-minute buffer
        alert_delay = (
            '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60) * 2 + 5)
            if self.autoscaler else DEFAULT_TTL
        )

        sensu_args = dict(
            check_name=SERVICE_CHECK_NAME,
            scheduler=self.options.scheduler,
            app=self.apps[0],  # TODO (CLUSTERMAN-126)
            check_every=check_every,
            source=f'{self.options.cluster}_{self.options.pool}',
            ttl=alert_delay,
            alert_after=alert_delay,
            noop=self.options.dry_run,
            pool=self.options.pool,
        )

        if service_failed:
            sensu_args['output'] = f'FAILED: clusterman autoscaler failed ({msg})'
            sensu_args['status'] = Status.CRITICAL
        else:
            sensu_args['output'] = 'OK: clusterman autoscaler is fine'
        sensu_checkin(**sensu_args)
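To make the check-in arithmetic concrete: with a 600-second run_frequency, check_every evaluates to '10m' and alert_delay to '25m' (two 10-minute run periods plus the five-minute buffer).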
Example #9
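A simulation hook: fetches and launches signal processes, loads spot fleet configs, seeds a SimulatedPoolManager from recorded target_capacity metrics, and builds an Autoscaler with monitoring disabled.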
    def _make_autoscaler(self, autoscaler_config_file: str) -> None:
        fetch_count, signal_count = setup_signals_environment(self.metadata.pool, self.metadata.scheduler)
        signal_dir = os.path.join(os.path.expanduser('~'), '.cache', 'clusterman')

        endpoint_url = staticconf.read_string('aws.endpoint_url', '').format(svc='s3')
        env = os.environ.copy()
        if endpoint_url:
            env['AWS_ENDPOINT_URL_ARGS'] = f'--endpoint-url {endpoint_url}'

        for i in range(fetch_count):
            subprocess.run(['fetch_clusterman_signal', str(i), signal_dir], check=True, env=env)
        for i in range(signal_count):
            subprocess.Popen(['run_clusterman_signal', str(i), signal_dir], env=env)

        with open(autoscaler_config_file) as f:
            autoscaler_config = yaml.safe_load(f)
        configs = autoscaler_config.get('configs', [])
        if 'sfrs' in autoscaler_config:
            aws_configs = ec2.describe_spot_fleet_requests(SpotFleetRequestIds=autoscaler_config['sfrs'])
            configs.extend([
                config['SpotFleetRequestConfig']
                for config in aws_configs['SpotFleetRequestConfigs']
            ])
        pool_manager = SimulatedPoolManager(self.metadata.cluster, self.metadata.pool, configs, self)
        metric_values = self.metrics_client.get_metric_values(
            'target_capacity',
            METADATA,
            self.start_time.timestamp,
            # metrics collector runs 1x/min, but we'll try to get five data points in case some data is missing
            self.start_time.shift(minutes=5).timestamp,
            use_cache=False,
            extra_dimensions=get_cluster_dimensions(
                self.metadata.cluster, self.metadata.pool, self.metadata.scheduler,
            ),
        )
        # take the earliest data point available - this is a Decimal, which doesn't play nicely, so convert to an int
        with patch_join_delay():
            actual_target_capacity = int(metric_values['target_capacity'][0][1])
            pool_manager.modify_target_capacity(actual_target_capacity, force=True, prune=False)

        for config in configs:
            for spec in config['LaunchSpecifications']:
                self.markets |= {get_instance_market(spec)}
        self.autoscaler = Autoscaler(
            self.metadata.cluster,
            self.metadata.pool,
            self.metadata.scheduler,
            [self.metadata.pool],
            pool_manager=pool_manager,
            metrics_client=self.metrics_client,
            monitoring_enabled=False,  # no sensu alerts during simulations
        )
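For reference, a hedged sketch of the structure this method expects from autoscaler_config_file after yaml.safe_load (keys inferred from the lookups above; the SFR id is a placeholder, and real SpotFleetRequestConfig payloads carry many more fields):

autoscaler_config = {
    'configs': [
        {'LaunchSpecifications': []},  # consumed by SimulatedPoolManager and the market loop
    ],
    'sfrs': ['sfr-12345678-1234-1234-1234-123456789012'],  # expanded via ec2.describe_spot_fleet_requests
}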
Example #10
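A behave step for Kubernetes that fabricates pending pods on a mocked cluster connector before constructing the Autoscaler, optionally enabling the prevent-scale-down-after-capacity-loss config.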
def create_k8s_autoscaler(context, prevent_scale_down_after_capacity_loss=False):
    behave.use_fixture(autoscaler_patches, context)
    context.mock_cluster_connector.__class__ = KubernetesClusterConnector
    context.mock_cluster_connector.get_cluster_allocated_resources.return_value = ClustermanResources(
        cpus=context.allocated_cpus,
    )
    context.mock_cluster_connector._pending_pods = []
    if float(context.pending_cpus) > 0:
        context.mock_cluster_connector.get_unschedulable_pods = (
            lambda: KubernetesClusterConnector.get_unschedulable_pods(context.mock_cluster_connector)
        )
        context.mock_cluster_connector._get_pod_unschedulable_reason.side_effect = lambda pod: (
            PodUnschedulableReason.InsufficientResources
            if pod.metadata.name == 'pod1'
            else PodUnschedulableReason.Unknown
        )
        context.mock_cluster_connector._pending_pods = [
            V1Pod(
                metadata=V1ObjectMeta(name='pod1'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[
                        V1PodCondition(status='False', type='PodScheduled', reason='Unschedulable'),
                    ],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(
                        name='container1',
                        resources=V1ResourceRequirements(requests={'cpu': context.pending_cpus}),
                    ),
                ]),
            ),
            V1Pod(
                metadata=V1ObjectMeta(name='pod2'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[
                        V1PodCondition(status='False', type='PodScheduled', reason='Unschedulable'),
                    ],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(
                        name='container1',
                        resources=V1ResourceRequirements(requests={'cpu': context.pending_cpus}),
                    ),
                ]),
            ),
        ]

    context.autoscaler = Autoscaler(
        cluster='kube-test',
        pool='bar',
        apps=['bar'],
        scheduler='kubernetes',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )

    if prevent_scale_down_after_capacity_loss:
        context.autoscaler.autoscaling_config = AutoscalingConfig(
            excluded_resources=[],
            setpoint=0.7,
            target_capacity_margin=0.1,
            prevent_scale_down_after_capacity_loss=True,
            instance_loss_threshold=0,
        )
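Pulling the recurring pattern together, a minimal hedged sketch of direct construction and one dry-run iteration (names are placeholders, and setup_config-style configuration is assumed to have run already):

pool_manager = PoolManager('mesos-test', 'bar', 'mesos')
autoscaler = Autoscaler(
    'mesos-test',
    'bar',
    'mesos',
    ['bar'],                   # apps; today this is just the pool name (CLUSTERMAN-126)
    monitoring_enabled=False,  # skip sensu alerts
    pool_manager=pool_manager,
)
autoscaler.run(dry_run=True)   # log decisions without modifying capacity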