def configure_initial(self):
    setup_config(self.options)
    self.autoscaler = None
    self.logger = logger
    self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing
    pool_manager = PoolManager(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
    )
    self.autoscaler = Autoscaler(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
        self.apps,
        monitoring_enabled=(not self.options.dry_run),
        pool_manager=pool_manager,
    )

    # We don't want to watch anything here because the autoscaler bootstrap script takes care of that for us
    self.config.watchers.clear()
class AutoscalerBatch(BatchRunningSentinelMixin):
    def parse_args(self):
        parser = argparse.ArgumentParser()
        arg_group = parser.add_argument_group('AutoscalerBatch options')
        add_cluster_arg(arg_group, required=True)
        add_pool_arg(arg_group)
        add_scheduler_arg(arg_group)
        add_cluster_config_directory_arg(arg_group)
        add_env_config_path_arg(arg_group)
        arg_group.add_argument(
            '--dry-run',
            default=False,
            action='store_true',
            help='If true, will only log autoscaling decisions instead of modifying capacities',
        )
        self.options = parser.parse_args()

    def configure(self) -> None:
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger
        self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing
        pool_manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=pool_manager,
        )

    def _autoscale(self) -> None:
        assert self.autoscaler
        time.sleep(splay_event_time(
            self.autoscaler.run_frequency,
            self.__class__.__name__ + self.options.cluster + self.options.pool,
        ))
        with suppress_request_limit_exceeded():
            self.autoscaler.run(dry_run=self.options.dry_run)

    def run(self) -> None:
        self.make_running_sentinel()
        while True:
            self._autoscale()
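# Hypothetical entrypoint sketch (not part of the original snippet above): it
# only chains the parse_args/configure/run methods that this AutoscalerBatch
# already defines, assuming the module is run directly as a script.
if __name__ == '__main__':
    batch = AutoscalerBatch()
    batch.parse_args()   # populates batch.options from the command line
    batch.configure()    # builds the PoolManager and Autoscaler
    batch.run()          # loops forever, autoscaling once per run_frequency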
def mock_autoscaler():
    autoscaling_config_dict = {
        'default_signal_role': 'clusterman',
        'setpoint': 0.7,
        'target_capacity_margin': 0.1,
    }
    with mock.patch(
        'clusterman.autoscaler.autoscaler.ClustermanMetricsBotoClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PoolManager',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.Autoscaler._get_signal_for_app',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.get_monitoring_client',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.ExternalSignal',
    ), mock.patch(
        'clusterman.autoscaler.autoscaler.PendingPodsSignal',
    ), staticconf.testing.PatchConfiguration(
        {'autoscaling': autoscaling_config_dict},
    ):
        mock_autoscaler = Autoscaler('mesos-test', 'bar', 'mesos', ['bar'], monitoring_enabled=False)
        mock_autoscaler.pool_manager.cluster_connector = mock.Mock()
        mock_autoscaler.pool_manager.target_capacity = 300
        mock_autoscaler.pool_manager.min_capacity = staticconf.read_int(
            'scaling_limits.min_capacity',
            namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos'),
        )
        mock_autoscaler.pool_manager.max_capacity = staticconf.read_int(
            'scaling_limits.max_capacity',
            namespace=POOL_NAMESPACE.format(pool='bar', scheduler='mesos'),
        )
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0

        mock_autoscaler.target_capacity_gauge = mock.Mock(spec=GaugeProtocol)
        mock_autoscaler.non_orphan_capacity_gauge = mock.Mock(spec=GaugeProtocol)
        mock_autoscaler.resource_request_gauges = {
            'mem': mock.Mock(spec=GaugeProtocol),
            'cpus': mock.Mock(spec=GaugeProtocol),
            'disk': mock.Mock(spec=GaugeProtocol),
            'gpus': mock.Mock(spec=GaugeProtocol),
        }
        return mock_autoscaler
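# Hypothetical pytest usage of the fixture above; it assumes mock_autoscaler is
# registered with @pytest.fixture. The test only reads attributes that the
# fixture itself sets, so it makes no claims about Autoscaler internals.
def test_mock_autoscaler_fixture_defaults(mock_autoscaler):
    assert mock_autoscaler.pool_manager.target_capacity == 300
    assert mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity == 0
    assert set(mock_autoscaler.resource_request_gauges) == {'mem', 'cpus', 'disk', 'gpus'}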
def autoscaler(context):
    behave.use_fixture(autoscaler_patches, context)
    context.autoscaler = Autoscaler(
        cluster='mesos-test',
        pool='bar',
        apps=['bar'],
        scheduler='mesos',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )
def configure(self) -> None:
    setup_config(self.options)
    self.autoscaler = None
    self.logger = logger
    self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing
    pool_manager = PoolManager(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
    )
    self.autoscaler = Autoscaler(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
        self.apps,
        monitoring_enabled=(not self.options.dry_run),
        pool_manager=pool_manager,
    )
def mesos_autoscaler(context):
    behave.use_fixture(autoscaler_patches, context)
    if hasattr(context, 'allocated_cpus'):
        context.autoscaler.metrics_client.get_metric_values.side_effect = make_mock_scaling_metrics(
            context.allocated_cpus,
            context.boost,
        )
    context.autoscaler = Autoscaler(
        cluster='mesos-test',
        pool='bar',
        apps=['bar'],
        scheduler='mesos',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )
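# Hypothetical behave step built on the helper above; the step text and module
# layout are assumptions, not part of the original test suite.
from behave import given

@given('a mesos autoscaler for the bar pool')
def given_mesos_autoscaler(context):
    mesos_autoscaler(context)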
def test_autoscaler_init_too_many_apps():
    with pytest.raises(NotImplementedError):
        Autoscaler('mesos-test', 'bar', 'mesos', ['app1', 'app2'], monitoring_enabled=False)
class AutoscalerBatch(BatchDaemon, BatchLoggingMixin, BatchRunningSentinelMixin):
    notify_emails = ['*****@*****.**']

    @batch_command_line_arguments
    def parse_args(self, parser):
        arg_group = parser.add_argument_group('AutoscalerBatch options')
        add_cluster_arg(arg_group, required=True)
        add_pool_arg(arg_group)
        add_scheduler_arg(arg_group)
        add_cluster_config_directory_arg(arg_group)
        add_env_config_path_arg(arg_group)
        arg_group.add_argument(
            '--dry-run',
            default=False,
            action='store_true',
            help='If true, will only log autoscaling decisions instead of modifying capacities',
        )

    @batch_configure
    def configure_initial(self):
        setup_config(self.options)
        self.autoscaler = None
        self.logger = logger
        self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing
        pool_manager = PoolManager(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
        )
        self.autoscaler = Autoscaler(
            self.options.cluster,
            self.options.pool,
            self.options.scheduler,
            self.apps,
            monitoring_enabled=(not self.options.dry_run),
            pool_manager=pool_manager,
        )

        # We don't want to watch anything here because the autoscaler bootstrap script takes care of that for us
        self.clear_watchers()

    def _get_local_log_stream(self, clog_prefix=None):
        # Overrides the yelp_batch default, which is tmp_batch_<filename> (autoscaler in this case).
        # This controls the name of the scribe log for this batch. Without this, the log
        # conflicts with other batches (like the Kew autoscaler). We create a separate log for each
        # cluster and (non-default) pool, so they are easy to distinguish.
        return get_autoscaler_scribe_stream(self.options.cluster, self.options.pool, self.options.scheduler)

    @sensu_alert_triage()
    def _autoscale(self):
        time.sleep(splay_event_time(
            self.autoscaler.run_frequency,
            self.get_name() + self.options.cluster + self.options.pool,
        ))
        with suppress_request_limit_exceeded():
            self.autoscaler.run(dry_run=self.options.dry_run)

    def run(self):
        # self.running is a property from yelp_batch which checks version_checker if a watcher config has changed.
        # If so, the entire batch restarts and configs for the service are reloaded.
        while self.running:
            try:
                self._autoscale()
            except (PoolConnectionError, EndpointConnectionError) as e:
                logger.exception(f'Encountered a connection error: {e}')

    def _do_sensu_checkins(self, service_failed, msg):
        check_every = (
            '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60))
            if self.autoscaler
            else DEFAULT_CHECK_EVERY
        )
        # magic-y numbers here; an alert will time out after two autoscaler run periods plus a five minute buffer
        alert_delay = (
            '{minutes}m'.format(minutes=int(self.autoscaler.run_frequency // 60) * 2 + 5)
            if self.autoscaler
            else DEFAULT_TTL
        )
        sensu_args = dict(
            check_name=SERVICE_CHECK_NAME,
            scheduler=self.options.scheduler,
            app=self.apps[0],  # TODO (CLUSTERMAN-126)
            check_every=check_every,
            source=f'{self.options.cluster}_{self.options.pool}',
            ttl=alert_delay,
            alert_after=alert_delay,
            noop=self.options.dry_run,
            pool=self.options.pool,
        )
        if service_failed:
            sensu_args['output'] = f'FAILED: clusterman autoscaler failed ({msg})'
            sensu_args['status'] = Status.CRITICAL
        else:
            sensu_args['output'] = 'OK: clusterman autoscaler is fine'
        sensu_checkin(**sensu_args)
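# Worked example of the sensu timing arithmetic in _do_sensu_checkins above,
# assuming run_frequency = 600 seconds (the value is illustrative):
run_frequency = 600
assert '{}m'.format(int(run_frequency // 60)) == '10m'          # check_every
assert '{}m'.format(int(run_frequency // 60) * 2 + 5) == '25m'  # alert timeout: two run periods plus a 5 minute buffer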
def _make_autoscaler(self, autoscaler_config_file: str) -> None:
    fetch_count, signal_count = setup_signals_environment(self.metadata.pool, self.metadata.scheduler)
    signal_dir = os.path.join(os.path.expanduser('~'), '.cache', 'clusterman')

    endpoint_url = staticconf.read_string('aws.endpoint_url', '').format(svc='s3')
    env = os.environ.copy()
    if endpoint_url:
        env['AWS_ENDPOINT_URL_ARGS'] = f'--endpoint-url {endpoint_url}'

    for i in range(fetch_count):
        subprocess.run(['fetch_clusterman_signal', str(i), signal_dir], check=True, env=env)
    for i in range(signal_count):
        subprocess.Popen(['run_clusterman_signal', str(i), signal_dir], env=env)

    with open(autoscaler_config_file) as f:
        autoscaler_config = yaml.safe_load(f)
    configs = autoscaler_config.get('configs', [])
    if 'sfrs' in autoscaler_config:
        aws_configs = ec2.describe_spot_fleet_requests(SpotFleetRequestIds=autoscaler_config['sfrs'])
        configs.extend([config['SpotFleetRequestConfig'] for config in aws_configs['SpotFleetRequestConfigs']])
    pool_manager = SimulatedPoolManager(self.metadata.cluster, self.metadata.pool, configs, self)
    metric_values = self.metrics_client.get_metric_values(
        'target_capacity',
        METADATA,
        self.start_time.timestamp,
        # metrics collector runs 1x/min, but we'll try to get five data points in case some data is missing
        self.start_time.shift(minutes=5).timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(self.metadata.cluster, self.metadata.pool, self.metadata.scheduler),
    )
    # take the earliest data point available - this is a Decimal, which doesn't play nicely, so convert to an int
    with patch_join_delay():
        actual_target_capacity = int(metric_values['target_capacity'][0][1])
        pool_manager.modify_target_capacity(actual_target_capacity, force=True, prune=False)

    for config in configs:
        for spec in config['LaunchSpecifications']:
            self.markets |= {get_instance_market(spec)}

    self.autoscaler = Autoscaler(
        self.metadata.cluster,
        self.metadata.pool,
        self.metadata.scheduler,
        [self.metadata.pool],
        pool_manager=pool_manager,
        metrics_client=self.metrics_client,
        monitoring_enabled=False,  # no sensu alerts during simulations
    )
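# Worked example of the endpoint_url templating in _make_autoscaler above
# (the URL below is hypothetical, not a real clusterman config value):
endpoint_url = 'http://{svc}.local.test:9000'.format(svc='s3')
assert endpoint_url == 'http://s3.local.test:9000'
assert f'--endpoint-url {endpoint_url}' == '--endpoint-url http://s3.local.test:9000'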
def create_k8s_autoscaler(context, prevent_scale_down_after_capacity_loss=False):
    behave.use_fixture(autoscaler_patches, context)
    context.mock_cluster_connector.__class__ = KubernetesClusterConnector
    context.mock_cluster_connector.get_cluster_allocated_resources.return_value = ClustermanResources(
        cpus=context.allocated_cpus,
    )
    context.mock_cluster_connector._pending_pods = []
    if float(context.pending_cpus) > 0:
        context.mock_cluster_connector.get_unschedulable_pods = \
            lambda: KubernetesClusterConnector.get_unschedulable_pods(context.mock_cluster_connector)
        context.mock_cluster_connector._get_pod_unschedulable_reason.side_effect = lambda pod: (
            PodUnschedulableReason.InsufficientResources
            if pod.metadata.name == 'pod1'
            else PodUnschedulableReason.Unknown
        )
        context.mock_cluster_connector._pending_pods = [
            V1Pod(
                metadata=V1ObjectMeta(name='pod1'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[V1PodCondition(status='False', type='PodScheduled', reason='Unschedulable')],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(
                        name='container1',
                        resources=V1ResourceRequirements(requests={'cpu': context.pending_cpus}),
                    ),
                ]),
            ),
            V1Pod(
                metadata=V1ObjectMeta(name='pod2'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[V1PodCondition(status='False', type='PodScheduled', reason='Unschedulable')],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(
                        name='container1',
                        resources=V1ResourceRequirements(requests={'cpu': context.pending_cpus}),
                    ),
                ]),
            ),
        ]
    context.autoscaler = Autoscaler(
        cluster='kube-test',
        pool='bar',
        apps=['bar'],
        scheduler='kubernetes',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )
    if prevent_scale_down_after_capacity_loss:
        context.autoscaler.autoscaling_config = AutoscalingConfig(
            excluded_resources=[],
            setpoint=0.7,
            target_capacity_margin=0.1,
            prevent_scale_down_after_capacity_loss=True,
            instance_loss_threshold=0,
        )