def test_pool_manager_init(mock_pool_manager, mock_resource_groups):
    assert mock_pool_manager.cluster == 'mesos-test'
    assert mock_pool_manager.pool == 'bar'
    assert mock_pool_manager.scheduler == 'mesos'

    with staticconf.testing.MockConfiguration(
        {
            'scaling_limits': {
                'max_tasks_to_kill': 'inf',
                'max_weight_to_add': 100,
                'max_weight_to_remove': 100,
                'min_capacity': 3,
                'max_capacity': 3,
            },
        },
        namespace='bar.mesos_config',
    ), mock.patch(
        'clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup.load',
        return_value={},
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.DrainingClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.PoolManager.reload_state'
    ):
        mock_manager = PoolManager('mesos-test', 'bar', 'mesos')
        mock_manager.resource_groups = mock_resource_groups
        assert mock_manager.max_tasks_to_kill == float('inf')
def mark_stale(manager: PoolManager, dry_run: bool) -> str:
    if not dry_run and not ask_for_confirmation(
        f'Marking all resource groups in {manager.cluster}, {manager.pool}.{manager.scheduler} stale. Proceed? '
    ):
        print('Aborting operation.')
        return ''

    manager.mark_stale(dry_run)
    return (
        f'All resource groups in {manager.pool}.{manager.scheduler} on {manager.cluster} manually '
        f'marked as stale by {getuser()}'
    )
def mock_pool_manager(mock_resource_groups):
    with mock.patch(
        'clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup.load',
        return_value={},
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.DrainingClient',
        autospec=True,
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.PoolManager.reload_state'
    ), mock.patch(
        'clusterman.autoscaler.pool_manager.ClusterConnector.load',
    ):
        manager = PoolManager('mesos-test', 'bar', 'mesos')
        manager.resource_groups = mock_resource_groups
        return manager
def main(args: argparse.Namespace) -> None:
    if args.target_capacity and args.mark_stale:
        raise ValueError('Cannot specify --target-capacity and --mark-stale simultaneously')

    manager = PoolManager(args.cluster, args.pool, args.scheduler)
    log_messages = []
    if args.target_capacity:
        log_message = change_target_capacity(manager, args.target_capacity, args.dry_run)
        log_messages.append(log_message)
    elif args.mark_stale:
        log_message = mark_stale(manager, args.dry_run)
        log_messages.append(log_message)

    for log_message in log_messages:
        if not log_message:
            continue
        print(log_message)
        if not args.dry_run:
            scribe_stream = get_autoscaler_scribe_stream(args.cluster, args.pool, args.scheduler)
            log_to_scribe(scribe_stream, f'{LOG_TEMPLATE} {log_message}')
def print_status(manager: PoolManager, args) -> None:
    sys.stdout.write('\n')
    print(f'Current status for the {manager.pool} pool in the {manager.cluster} cluster:\n')
    print(
        f'Resource groups (target capacity: {manager.target_capacity}, fulfilled: {manager.fulfilled_capacity}, '
        f'non-orphan: {manager.non_orphan_fulfilled_capacity}):'
    )

    # get_node_metadatas() returns a list, so the non-verbose default is an empty list
    node_metadatas = manager.get_node_metadatas() if args.verbose else []
    for group in manager.resource_groups.values():
        _write_resource_group_line(group)
        for metadata in node_metadatas:
            if (
                metadata.instance.group_id != group.id
                or (args.only_orphans and metadata.agent.state != AgentState.ORPHANED)
                or (args.only_idle and metadata.agent.state != AgentState.IDLE)
            ):
                continue
            _write_agent_details(metadata)
        sys.stdout.write('\n')

    _write_summary(manager)
    sys.stdout.write('\n')
def configure_initial(self):
    setup_config(self.options)
    self.autoscaler = None
    self.logger = logger

    self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing

    pool_manager = PoolManager(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
    )
    self.autoscaler = Autoscaler(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
        self.apps,
        monitoring_enabled=(not self.options.dry_run),
        pool_manager=pool_manager,
    )

    # We don't want to watch anything here because the autoscaler bootstrap script takes care of that for us
    self.config.watchers.clear()
def _status_json(manager: PoolManager, get_node_metadatas: bool) -> StatusJsonObject:
    node_metadatas = manager.get_node_metadatas() if get_node_metadatas else []
    return {
        'disabled': autoscaling_is_paused(manager.cluster, manager.pool, manager.scheduler, arrow.now()),
        'target_capacity': manager.target_capacity,
        'fulfilled_capacity': manager.fulfilled_capacity,
        'non_orphan_fulfilled_capacity': manager.non_orphan_fulfilled_capacity,
        'resource_groups': _get_resource_groups_json(manager.resource_groups.values(), node_metadatas),
    }
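# A sketch of the payload _status_json produces. The values below are
# illustrative, and the shape of each 'resource_groups' entry is an
# assumption, since _get_resource_groups_json is defined elsewhere.
example_status = {
    'disabled': False,
    'target_capacity': 3,
    'fulfilled_capacity': 3.0,
    'non_orphan_fulfilled_capacity': 3.0,
    'resource_groups': [],  # one JSON object per group in manager.resource_groups
}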
def make_pool_manager(context, num, rg_type):
    behave.use_fixture(boto_patches, context)
    behave.use_fixture(mock_agents_by_ip_and_tasks, context)
    context.rg_type = rg_type
    context.rg_num = int(num)
    behave.use_fixture(mock_reload_resource_groups, context)
    context.pool_manager = PoolManager('mesos-test', 'bar', 'mesos')
    context.rg_ids = list(context.pool_manager.resource_groups)
    context.pool_manager.max_capacity = 101
def load_pool_managers(self) -> None:
    logger.info('Reloading all PoolManagers')
    # Dict rather than Mapping, since we mutate it below
    self.pool_managers: Dict[str, PoolManager] = {}
    for scheduler, pools in self.pools.items():
        for pool in pools:
            logger.info(f'Loading resource groups for {pool}.{scheduler} on {self.options.cluster}')
            self.pool_managers[f'{pool}.{scheduler}'] = PoolManager(self.options.cluster, pool, scheduler)
def change_target_capacity(manager: PoolManager, target_capacity: str, dry_run: bool) -> str:
    old_target = manager.target_capacity
    requested_target = get_target_capacity_value(target_capacity, manager.pool, manager.scheduler)
    if not dry_run and not ask_for_confirmation(
        f'Modifying target capacity for {manager.cluster}, {manager.pool}.{manager.scheduler} '
        f'from {old_target} to {requested_target}. Proceed? '
    ):
        print('Aborting operation.')
        return ''

    new_target = manager.modify_target_capacity(requested_target, dry_run)
    return (
        f'Target capacity for {manager.pool}.{manager.scheduler} on {manager.cluster} manually changed '
        f'from {old_target} to {new_target} by {getuser()}'
    )
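# A minimal usage sketch for change_target_capacity, reusing the
# 'mesos-test'/'bar'/'mesos' names from the tests above. With dry_run=True
# the confirmation prompt is skipped and modify_target_capacity is told not
# to apply the change; passing a plain number string ('50') is an assumption
# about what get_target_capacity_value accepts.
manager = PoolManager('mesos-test', 'bar', 'mesos')
message = change_target_capacity(manager, '50', dry_run=True)
if message:  # an empty string means the operation was aborted
    print(message)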
def configure(self) -> None:
    setup_config(self.options)
    self.autoscaler = None
    self.logger = logger

    self.apps = [self.options.pool]  # TODO (CLUSTERMAN-126) someday these should not be the same thing

    pool_manager = PoolManager(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
    )
    self.autoscaler = Autoscaler(
        self.options.cluster,
        self.options.pool,
        self.options.scheduler,
        self.apps,
        monitoring_enabled=(not self.options.dry_run),
        pool_manager=pool_manager,
    )
def make_pool_manager(context, num, rg_type):
    behave.use_fixture(boto_patches, context)
    behave.use_fixture(mock_agents_by_ip_and_tasks, context)
    context.rg_type = rg_type
    with mock.patch(
        'clusterman.aws.auto_scaling_resource_group.AutoScalingResourceGroup.load',
        return_value={},
    ) as mock_asg_load, mock.patch(
        'clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup.load',
        return_value={},
    ) as mock_sfr_load, mock.patch(
        'clusterman.aws.ec2_fleet_resource_group.EC2FleetResourceGroup.load',
        return_value={},
    ) as mock_fleet_load:
        if context.rg_type == 'asg':
            mock_asg_load.return_value = mock_asgs(int(num), context.subnet_id)
        elif context.rg_type == 'sfr':
            mock_sfr_load.return_value = mock_sfrs(int(num), context.subnet_id)
        elif context.rg_type == 'fleet':
            mock_fleet_load.return_value = mock_fleets(int(num), context.subnet_id)
        context.pool_manager = PoolManager('mesos-test', 'bar', 'mesos')
        context.rg_ids = list(context.pool_manager.resource_groups)
        context.pool_manager.max_capacity = 101
def __init__(
    self,
    cluster: str,
    pool: str,
    scheduler: str,
    apps: List[str],
    pool_manager: Optional[PoolManager] = None,
    metrics_client: Optional[ClustermanMetricsBotoClient] = None,
    monitoring_enabled: bool = True,
) -> None:
    """ Class containing the core logic for autoscaling a cluster

    :param cluster: the name of the cluster to autoscale
    :param pool: the name of the pool to autoscale
    :param scheduler: the scheduler the pool runs under (e.g. mesos)
    :param apps: a list of apps running on the pool
    :param pool_manager: a PoolManager object (used for simulations)
    :param metrics_client: a ClustermanMetricsBotoClient object (used for simulations)
    :param monitoring_enabled: set to False to disable sensu alerts during scaling
    """
    self.cluster = cluster
    self.pool = pool
    self.scheduler = scheduler
    self.apps = apps
    self.monitoring_enabled = monitoring_enabled

    # TODO: handle multiple apps in the autoscaler (CLUSTERMAN-126)
    if len(self.apps) > 1:
        raise NotImplementedError('Scaling multiple apps in a cluster is not yet supported')

    logger.info(f'Initializing autoscaler engine for {self.pool} in {self.cluster}...')

    gauge_dimensions = {'cluster': cluster, 'pool': pool}
    monitoring_client = get_monitoring_client()
    self.target_capacity_gauge = monitoring_client.create_gauge(TARGET_CAPACITY_GAUGE_NAME, gauge_dimensions)
    self.resource_request_gauges: Dict[str, Any] = {}
    for resource in ('cpus', 'mem', 'disk'):
        self.resource_request_gauges[resource] = monitoring_client.create_gauge(
            RESOURCE_GAUGE_BASE_NAME.format(resource=resource),
            gauge_dimensions,
        )

    self.autoscaling_config = get_autoscaling_config(
        POOL_NAMESPACE.format(pool=self.pool, scheduler=self.scheduler),
    )
    self.pool_manager = pool_manager or PoolManager(self.cluster, self.pool, self.scheduler)

    self.mesos_region = staticconf.read_string('aws.region')
    self.metrics_client = metrics_client or ClustermanMetricsBotoClient(self.mesos_region)
    self.default_signal = Signal(
        self.cluster,
        self.pool,
        self.scheduler,
        '__default__',
        DEFAULT_NAMESPACE,
        self.metrics_client,
        signal_namespace=staticconf.read_string('autoscaling.default_signal_role'),
    )
    self.signal = self._get_signal_for_app(self.apps[0])

    logger.info('Initialization complete')
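# A minimal construction sketch for Autoscaler, reusing the cluster/pool/
# scheduler names from the tests above; a real run needs staticconf/AWS
# configuration loaded first (see setup_config in the batch code above).
autoscaler = Autoscaler(
    cluster='mesos-test',
    pool='bar',
    scheduler='mesos',
    apps=['bar'],              # exactly one app; more raises NotImplementedError
    monitoring_enabled=False,  # suppress sensu alerts, e.g. for local testing
)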
def main(args: argparse.Namespace) -> None:  # pragma: no cover
    manager = PoolManager(args.cluster, args.pool, args.scheduler)
    if args.json:
        print_status_json(manager)
    else:
        print_status(manager, args)