def terminate_instances_by_id(self, instance_ids: List[str], batch_size: int = 500) -> Sequence[str]:
    """ Terminate instances in this resource group

    :param instance_ids: a list of instance IDs to terminate
    :param batch_size: number of instances to terminate at one time
    :returns: a list of terminated instance IDs
    """
    if not instance_ids:
        logger.warning(f'No instances to terminate in {self.group_id}')
        return []

    instance_weights = {}
    for instance in ec2_describe_instances(instance_ids):
        instance_market = get_instance_market(instance)
        if not instance_market.az:
            logger.warning(
                f"Instance {instance['InstanceId']} missing AZ info, likely already terminated so skipping",
            )
            instance_ids.remove(instance['InstanceId'])
            continue
        instance_weights[instance['InstanceId']] = self.market_weight(instance_market)

    # The AWS API recommends not terminating more than 1000 instances at a time, and to
    # terminate larger numbers in batches
    terminated_instance_ids = []
    for batch in range(0, len(instance_ids), batch_size):
        response = ec2.terminate_instances(InstanceIds=instance_ids[batch:batch + batch_size])
        terminated_instance_ids.extend([
            instance['InstanceId']
            for instance in response['TerminatingInstances']
        ])

    # It's possible that not every instance was terminated.  The most likely cause is that AWS
    # terminated the instance in between getting its status and the terminate_instances request.
    # This is probably fine, but log a warning just in case.
    missing_instances = set(instance_ids) - set(terminated_instance_ids)
    if missing_instances:
        logger.warning('Some instances could not be terminated; they were probably killed previously')
        logger.warning(f'Missing instances: {list(missing_instances)}')

    terminated_capacity = sum(instance_weights[i] for i in instance_ids)
    logger.info(f'{self.id} terminated weight: {terminated_capacity}; instances: {terminated_instance_ids}')
    return terminated_instance_ids
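# A minimal standalone sketch (not part of the original module) of the batching pattern used in
# terminate_instances_by_id above: instance IDs are sliced into groups of at most batch_size so
# that no single EC2 TerminateInstances call exceeds the recommended size.  The helper name
# chunked() and the sample IDs below are illustrative assumptions only.
from typing import List, Sequence


def chunked(ids: Sequence[str], batch_size: int) -> List[Sequence[str]]:
    """ Split a sequence of instance IDs into consecutive slices of at most batch_size elements """
    return [ids[start:start + batch_size] for start in range(0, len(ids), batch_size)]


assert chunked(['i-1', 'i-2', 'i-3'], 2) == [['i-1', 'i-2'], ['i-3']]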
def __init__(self, config, simulator):
    """
    :param config: a configuration dictionary that follows the SFR launch configuration schema.  Not all
        values needed for a real SFR config are required here; specifically, we require the following elements:

        {
            'AllocationStrategy': How to spread capacity across markets (only 'diversified' is supported),
            'LaunchSpecifications': [
                {
                    'InstanceType': AWS EC2 instance type name,
                    'SubnetId': Subnet the instance should be launched in (should map to a region in common/aws.py),
                    'SpotPrice': How much, in terms of price per unit capacity, to bid in this market,
                    'WeightedCapacity': How much to weight instances in this market by when calculating capacity,
                },
                ...
            ]
        }
    :param simulator: the simulator that this resource group runs in
    """
    SimulatedAWSCluster.__init__(self, simulator)
    AWSResourceGroup.__init__(self, f'ssfr-{uuid4()}')
    self._instance_types = {}
    for spec in config['LaunchSpecifications']:
        bid_price = float(spec['SpotPrice']) * spec['WeightedCapacity']
        market = get_instance_market(spec)
        self._instance_types[market] = SpotMarketConfig(bid_price, spec['WeightedCapacity'])

    self.__target_capacity = 0
    self.allocation_strategy = config['AllocationStrategy']
    if self.allocation_strategy != 'diversified':
        raise NotImplementedError(f'{self.allocation_strategy} not supported')
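# A hedged example of the `config` dictionary described in the docstring above.  Instance types,
# subnet IDs, prices, and weights are illustrative values only; the constructor also reads
# 'AllocationStrategy', which must currently be 'diversified'.
EXAMPLE_SFR_CONFIG = {
    'AllocationStrategy': 'diversified',
    'LaunchSpecifications': [
        {
            'InstanceType': 'm5.large',
            'SubnetId': 'subnet-000000000000000aa',
            'SpotPrice': '0.05',      # price per unit of capacity
            'WeightedCapacity': 2,    # capacity units contributed by each instance in this market
        },
        {
            'InstanceType': 'c5.xlarge',
            'SubnetId': 'subnet-000000000000000bb',
            'SpotPrice': '0.04',
            'WeightedCapacity': 4,
        },
    ],
}
# Illustrative usage: pass this dict as `config` to the constructor above, along with a simulator.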
def get_instance_metadatas(self, state_filter: Optional[Collection[str]] = None) -> Sequence[InstanceMetadata]:
    """ Get metadata for the instances in this resource group

    :param state_filter: only return instances whose AWS state name is in this collection; None returns all
    :returns: a list of InstanceMetadata objects
    """
    instance_metadatas = []
    for instance_dict in ec2_describe_instances(instance_ids=self.instance_ids):
        aws_state = instance_dict['State']['Name']
        if state_filter and aws_state not in state_filter:
            continue

        instance_market = get_instance_market(instance_dict)
        instance_ip = instance_dict.get('PrivateIpAddress')
        hostname = gethostbyaddr(instance_ip)[0] if instance_ip else None

        metadata = InstanceMetadata(
            group_id=self.id,
            hostname=hostname,
            instance_id=instance_dict['InstanceId'],
            ip_address=instance_ip,
            is_stale=(instance_dict['InstanceId'] in self.stale_instance_ids),
            market=instance_market,
            state=aws_state,
            uptime=(arrow.now() - arrow.get(instance_dict['LaunchTime'])),
            weight=self.market_weight(instance_market),
        )
        instance_metadatas.append(metadata)

    return instance_metadatas
def _instances_by_market(self):
    """ Responses from this API call are cached to prevent hitting any AWS request limits """
    instance_dict: Mapping[InstanceMarket, List[Mapping]] = defaultdict(list)
    for instance in ec2_describe_instances(self.instance_ids):
        instance_dict[get_instance_market(instance)].append(instance)
    return instance_dict
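# Hedged illustration of the time-bounded caching that the docstring above refers to.  The actual
# caching decorator used by the project is not shown in this snippet, so the decorator below is an
# assumption rather than the project's implementation; it memoizes a zero-argument method for a TTL.
import time
from functools import wraps


def ttl_cache(ttl_seconds: float):
    """ Cache a zero-argument method's result for ttl_seconds to limit repeated AWS API calls """
    def decorator(fn):
        cache: dict = {}  # shared per-method (not per-instance), kept simple for the sketch

        @wraps(fn)
        def wrapper(self):
            now = time.time()
            if 'value' not in cache or now - cache['time'] > ttl_seconds:
                cache['value'], cache['time'] = fn(self), now
            return cache['value']
        return wrapper
    return decorator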
def _generate_market_weights(self) -> Mapping[InstanceMarket, float]:
    if not self._configuration:
        return {}
    return {
        get_instance_market(spec): spec['WeightedCapacity']
        for spec in self._configuration['SpotFleetRequestConfig']['LaunchSpecifications']
    }
def _make_autoscaler(self, autoscaler_config_file: str) -> None:
    fetch_count, signal_count = setup_signals_environment(self.metadata.pool, self.metadata.scheduler)
    signal_dir = os.path.join(os.path.expanduser('~'), '.cache', 'clusterman')

    endpoint_url = staticconf.read_string('aws.endpoint_url', '').format(svc='s3')
    env = os.environ.copy()
    if endpoint_url:
        env['AWS_ENDPOINT_URL_ARGS'] = f'--endpoint-url {endpoint_url}'

    for i in range(fetch_count):
        subprocess.run(['fetch_clusterman_signal', str(i), signal_dir], check=True, env=env)
    for i in range(signal_count):
        subprocess.Popen(['run_clusterman_signal', str(i), signal_dir], env=env)

    with open(autoscaler_config_file) as f:
        autoscaler_config = yaml.safe_load(f)
    configs = autoscaler_config.get('configs', [])
    if 'sfrs' in autoscaler_config:
        aws_configs = ec2.describe_spot_fleet_requests(SpotFleetRequestIds=autoscaler_config['sfrs'])
        configs.extend([config['SpotFleetRequestConfig'] for config in aws_configs['SpotFleetRequestConfigs']])

    pool_manager = SimulatedPoolManager(self.metadata.cluster, self.metadata.pool, configs, self)
    metric_values = self.metrics_client.get_metric_values(
        'target_capacity',
        METADATA,
        self.start_time.timestamp,
        # the metrics collector runs 1x/min, but we'll try to get five data points in case some data is missing
        self.start_time.shift(minutes=5).timestamp,
        use_cache=False,
        extra_dimensions=get_cluster_dimensions(self.metadata.cluster, self.metadata.pool, self.metadata.scheduler),
    )

    # take the earliest data point available - this is a Decimal, which doesn't play nicely, so convert to an int
    with patch_join_delay():
        actual_target_capacity = int(metric_values['target_capacity'][0][1])
        pool_manager.modify_target_capacity(actual_target_capacity, force=True, prune=False)

    for config in configs:
        for spec in config['LaunchSpecifications']:
            self.markets |= {get_instance_market(spec)}

    self.autoscaler = Autoscaler(
        self.metadata.cluster,
        self.metadata.pool,
        self.metadata.scheduler,
        [self.metadata.pool],
        pool_manager=pool_manager,
        metrics_client=self.metrics_client,
        monitoring_enabled=False,  # no sensu alerts during simulations
    )
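# Hedged sketch of the autoscaler config file that _make_autoscaler above expects to load: the
# top-level keys ('configs', 'sfrs') come from the code, but every value shown here is an
# illustrative assumption.  Represented as the dict that yaml.safe_load would return.
EXAMPLE_AUTOSCALER_CONFIG = {
    'configs': [
        {
            'LaunchSpecifications': [
                {
                    'InstanceType': 'm5.large',
                    'SubnetId': 'subnet-000000000000000aa',
                    'SpotPrice': '0.05',
                    'WeightedCapacity': 2,
                },
            ],
        },
    ],
    # Spot fleet request IDs whose SpotFleetRequestConfig will be fetched from EC2 and appended to 'configs'
    'sfrs': ['sfr-00000000-0000-0000-0000-000000000000'],
}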