Example #1
    def terminate_instances_by_id(self,
                                  instance_ids: List[str],
                                  batch_size: int = 500) -> Sequence[str]:
        """ Terminate instances in this resource group

        :param instance_ids: a list of instance IDs to terminate
        :param batch_size: number of instances to terminate at one time
        :returns: a list of terminated instance IDs
        """
        if not instance_ids:
            logger.warning(f'No instances to terminate in {self.group_id}')
            return []

        instance_weights = {}
        for instance in ec2_describe_instances(instance_ids):
            instance_market = get_instance_market(instance)
            if not instance_market.az:
                logger.warning(
                    f"Instance {instance['InstanceId']} missing AZ info, likely already terminated so skipping",
                )
                # Drop the ID so the batches and the missing-instance check below
                # stay consistent; note this mutates the caller's list in place.
                instance_ids.remove(instance['InstanceId'])
                continue
            instance_weights[instance['InstanceId']] = self.market_weight(
                instance_market)

        # AWS API recommends not terminating more than 1000 instances at a time, and to
        # terminate larger numbers in batches
        terminated_instance_ids = []
        for batch_start in range(0, len(instance_ids), batch_size):
            response = ec2.terminate_instances(
                InstanceIds=instance_ids[batch_start:batch_start + batch_size])
            terminated_instance_ids.extend([
                instance['InstanceId']
                for instance in response['TerminatingInstances']
            ])

        # It's possible that not every instance is terminated.  The most likely cause for this
        # is that AWS terminated the instance in between getting its status and the terminate_instances
        # request.  This is probably fine but let's log a warning just in case.
        missing_instances = set(instance_ids) - set(terminated_instance_ids)
        if missing_instances:
            logger.warning(
                'Some instances could not be terminated; they were probably killed previously'
            )
            logger.warning(f'Missing instances: {list(missing_instances)}')
        # Only count the weight of instances that were actually terminated.
        terminated_capacity = sum(
            instance_weights.get(i, 0) for i in terminated_instance_ids)

        logger.info(
            f'{self.id} terminated weight: {terminated_capacity}; instances: {terminated_instance_ids}'
        )
        return terminated_instance_ids
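
The batching loop above is the standard slice-stepping idiom; a self-contained sketch with made-up instance IDs (the batch size is chosen arbitrarily):

instance_ids = [f'i-{n:03d}' for n in range(7)]
batch_size = 3

for batch_start in range(0, len(instance_ids), batch_size):
    batch = instance_ids[batch_start:batch_start + batch_size]
    print(batch)  # three batches, of sizes 3, 3, and 1
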
Example #2
    def __init__(self, config, simulator):
        """
        :param config: a configuration dictionary that follows the SFR launch configuration schema.  Not all values
            needed for the SFR config are required here.  Specifically, we require the following elements:
            {
                'LaunchSpecifications': [
                    {
                        'InstanceType': AWS EC2 instance type name,
                        'SubnetId': Subnet the instance should be launched in (should map to a region in common/aws.py),
                        'SpotPrice': How much, in terms of price per unit capacity, to bid in this market,
                        'WeightedCapacity': How much to weight instances in this market by when calculating capacity
                    },
                    ...
                ]
            }
        """
        SimulatedAWSCluster.__init__(self, simulator)
        AWSResourceGroup.__init__(self, f'ssfr-{uuid4()}')
        self._instance_types = {}
        for spec in config['LaunchSpecifications']:
            bid_price = float(spec['SpotPrice']) * spec['WeightedCapacity']
            market = get_instance_market(spec)
            self._instance_types[market] = SpotMarketConfig(
                bid_price, spec['WeightedCapacity'])

        self.__target_capacity = 0
        self.allocation_strategy = config['AllocationStrategy']
        if self.allocation_strategy != 'diversified':
            raise NotImplementedError(
                f'{self.allocation_strategy} not supported')
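
For illustration, a minimal config matching the documented schema might look like the following; the instance type, subnet ID, prices, and the SimulatedSpotFleetResourceGroup class name are assumptions for the sketch, not values from the source:

# Hypothetical config; every value below is invented for illustration.
config = {
    'AllocationStrategy': 'diversified',  # the only strategy this class accepts
    'LaunchSpecifications': [
        {
            'InstanceType': 'm5.large',
            'SubnetId': 'subnet-00000000',
            'SpotPrice': '0.05',          # price per unit of capacity, as a string
            'WeightedCapacity': 2,
        },
    ],
}
# group = SimulatedSpotFleetResourceGroup(config, simulator)  # class name assumed
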
Example #3
    def get_instance_metadatas(
        self,
        state_filter: Optional[Collection[str]] = None
    ) -> Sequence[InstanceMetadata]:
        instance_metadatas = []
        for instance_dict in ec2_describe_instances(
                instance_ids=self.instance_ids):
            aws_state = instance_dict['State']['Name']
            if state_filter and aws_state not in state_filter:
                continue

            instance_market = get_instance_market(instance_dict)
            instance_ip = instance_dict.get('PrivateIpAddress')
            # Reverse-DNS lookup; only attempted when the instance has a private IP.
            hostname = gethostbyaddr(instance_ip)[0] if instance_ip else None

            metadata = InstanceMetadata(
                group_id=self.id,
                hostname=hostname,
                instance_id=instance_dict['InstanceId'],
                ip_address=instance_ip,
                is_stale=(instance_dict['InstanceId']
                          in self.stale_instance_ids),
                market=instance_market,
                state=aws_state,
                uptime=(arrow.now() - arrow.get(instance_dict['LaunchTime'])),
                weight=self.market_weight(instance_market),
            )
            instance_metadatas.append(metadata)
        return instance_metadatas
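
A hypothetical call site for the state filter (the group variable and the chosen states are assumptions):

# Only consider instances AWS reports as pending or running.
metadatas = group.get_instance_metadatas(state_filter={'pending', 'running'})
stale_ids = [m.instance_id for m in metadatas if m.is_stale]
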
Example #4
    def _instances_by_market(self):
        """ Responses from this API call are cached to prevent hitting any AWS request limits """
        instance_dict: Mapping[InstanceMarket, List[Mapping]] = defaultdict(list)
        for instance in ec2_describe_instances(self.instance_ids):
            instance_dict[get_instance_market(instance)].append(instance)
        return instance_dict
Example #5
    def _generate_market_weights(self) -> Mapping[InstanceMarket, float]:
        if not self._configuration:
            return {}

        return {
            get_instance_market(spec): spec['WeightedCapacity']
            for spec in self._configuration['SpotFleetRequestConfig']['LaunchSpecifications']
        }
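
The grouping in _instances_by_market is the standard defaultdict idiom; a self-contained sketch with toy records (all values invented):

from collections import defaultdict

instances = [
    {'InstanceId': 'i-aaa', 'market': 'm5.large/us-west-2a'},
    {'InstanceId': 'i-bbb', 'market': 'm5.large/us-west-2a'},
    {'InstanceId': 'i-ccc', 'market': 'c5.large/us-west-2b'},
]

by_market = defaultdict(list)
for inst in instances:
    by_market[inst['market']].append(inst)

print({k: len(v) for k, v in by_market.items()})
# {'m5.large/us-west-2a': 2, 'c5.large/us-west-2b': 1}
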
Example #6
    def _make_autoscaler(self, autoscaler_config_file: str) -> None:
        fetch_count, signal_count = setup_signals_environment(
            self.metadata.pool, self.metadata.scheduler)
        signal_dir = os.path.join(os.path.expanduser('~'), '.cache',
                                  'clusterman')

        endpoint_url = staticconf.read_string('aws.endpoint_url',
                                              '').format(svc='s3')
        env = os.environ.copy()
        if endpoint_url:
            env['AWS_ENDPOINT_URL_ARGS'] = f'--endpoint-url {endpoint_url}'

        for i in range(fetch_count):
            subprocess.run(['fetch_clusterman_signal',
                            str(i), signal_dir],
                           check=True,
                           env=env)
        for i in range(signal_count):
            subprocess.Popen(['run_clusterman_signal',
                              str(i), signal_dir],
                             env=env)

        with open(autoscaler_config_file) as f:
            autoscaler_config = yaml.safe_load(f)
        configs = autoscaler_config.get('configs', [])
        if 'sfrs' in autoscaler_config:
            aws_configs = ec2.describe_spot_fleet_requests(
                SpotFleetRequestIds=autoscaler_config['sfrs'])
            configs.extend([
                config['SpotFleetRequestConfig']
                for config in aws_configs['SpotFleetRequestConfigs']
            ])
        pool_manager = SimulatedPoolManager(self.metadata.cluster,
                                            self.metadata.pool, configs, self)
        metric_values = self.metrics_client.get_metric_values(
            'target_capacity',
            METADATA,
            self.start_time.timestamp,
            # metrics collector runs 1x/min, but we'll try to get five data points in case some data is missing
            self.start_time.shift(minutes=5).timestamp,
            use_cache=False,
            extra_dimensions=get_cluster_dimensions(self.metadata.cluster,
                                                    self.metadata.pool,
                                                    self.metadata.scheduler),
        )
        # Take the earliest data point available; the value is a Decimal, which
        # doesn't interoperate well with downstream arithmetic, so convert it to an int.
        with patch_join_delay():
            actual_target_capacity = int(
                metric_values['target_capacity'][0][1])
            pool_manager.modify_target_capacity(actual_target_capacity,
                                                force=True,
                                                prune=False)

        for config in configs:
            for spec in config['LaunchSpecifications']:
                self.markets.add(get_instance_market(spec))
        self.autoscaler = Autoscaler(
            self.metadata.cluster,
            self.metadata.pool,
            self.metadata.scheduler,
            [self.metadata.pool],
            pool_manager=pool_manager,
            metrics_client=self.metrics_client,
            monitoring_enabled=False,  # no sensu alerts during simulations
        )
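
The earliest-data-point extraction above assumes the metrics client returns, per metric name, an oldest-first list of (timestamp, value) pairs whose values are Decimals; a sketch of just that step under the same assumption:

from decimal import Decimal

# Assumed result shape: metric name -> [(timestamp, value), ...], oldest first.
metric_values = {
    'target_capacity': [(1700000000, Decimal('12')), (1700000060, Decimal('13'))],
}

# Take the earliest point and coerce the Decimal to an int, mirroring
# what _make_autoscaler does before calling modify_target_capacity.
actual_target_capacity = int(metric_values['target_capacity'][0][1])
assert actual_target_capacity == 12
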