Example #1
    def _get_historical_weighted_resource_value(self) -> ClustermanResources:
        """ Compute the weighted value of each type of resource in the cluster

        returns: a ClustermanResources object with the weighted value of each resource, or 0 for any resource whose value couldn't be determined
        """
        capacity_history = self._get_smoothed_non_zero_metadata(
            'non_orphan_fulfilled_capacity',
            time_start=arrow.now().shift(weeks=-1).timestamp,
            time_end=arrow.now().timestamp,
        )
        if not capacity_history:
            return ClustermanResources()
        time_start, time_end, non_orphan_fulfilled_capacity = capacity_history

        weighted_resource_dict: MutableMapping[str, float] = {}
        for resource in ClustermanResources._fields:
            resource_history = self._get_smoothed_non_zero_metadata(
                f'{resource}_total',
                time_start=time_start,
                time_end=time_end,
            )
            if not resource_history:
                weighted_resource_dict[resource] = 0
            else:
                weighted_resource_dict[resource] = resource_history[2] / non_orphan_fulfilled_capacity

        return ClustermanResources(**weighted_resource_dict)
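
In effect, each weighted value is the smoothed historical total of that resource per unit of non-orphan fulfilled capacity. A minimal sketch of the arithmetic, reusing the hypothetical smoothed averages from Example #14 below (None marks a resource with no history):

non_orphan_fulfilled_capacity = 78
smoothed_totals = {'cpus': 20, 'mem': None, 'disk': 0.1, 'gpus': 1}
weighted = {
    resource: (total / non_orphan_fulfilled_capacity if total is not None else 0)
    for resource, total in smoothed_totals.items()
}
assert weighted['cpus'] == 20 / 78  # each unit of capacity historically carried ~0.256 cpus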
Example #2
class AgentMetadata(NamedTuple):
    agent_id: str = ''
    allocated_resources: ClustermanResources = ClustermanResources()
    batch_task_count: int = 0
    state: AgentState = AgentState.UNKNOWN
    task_count: int = 0
    total_resources: ClustermanResources = ClustermanResources()
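
Every field has a default, so partial construction is cheap. A small usage sketch (the field values are made up; AgentState.IDLE is taken from Example #19 below):

agent = AgentMetadata(agent_id='abc123', state=AgentState.IDLE, task_count=4)
# NamedTuples are immutable, so derive updated copies with _replace:
agent = agent._replace(task_count=5)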
Example #3
    def get_cluster_allocated_resources(self) -> ClustermanResources:
        """Get all allocated resources for the cluster"""
        allocated_resources = {
            resource: self.get_resource_allocation(resource)
            for resource in ClustermanResources._fields
        }
        return ClustermanResources(**allocated_resources)
Example #4
def test_get_resource_request_only_pending_pods(pending_pods):
    assert _get_resource_request(ClustermanResources(), pending_pods) == SignalResourceRequest(
        cpus=6,
        mem=1000,
        disk=0,
        gpus=0,
    )
Example #5
    def _get_cluster_total_resources(self) -> ClustermanResources:
        total_resources = {
            resource: self.pool_manager.cluster_connector.get_resource_total(resource)
            for resource in ClustermanResources._fields
        }
        return ClustermanResources(**total_resources)
Example #6
def total_node_resources(node: KubernetesNode) -> ClustermanResources:
    return ClustermanResources(
        cpus=ResourceParser.cpus(node.status.capacity),
        mem=ResourceParser.mem(node.status.capacity),
        disk=ResourceParser.disk(node.status.capacity),
        gpus=ResourceParser.gpus(node.status.capacity),
    )
Example #7
    def get_cluster_total_resources(self) -> ClustermanResources:
        """Get the total available resources for the cluster"""
        total_resources = {
            resource: self.get_resource_total(resource)
            for resource in ClustermanResources._fields
        }
        return ClustermanResources(**total_resources)
Example #8
def total_agent_resources(agent: MesosAgentDict) -> ClustermanResources:
    resources = agent.get('resources', {})
    return ClustermanResources(
        cpus=resources.get('cpus', 0),
        mem=resources.get('mem', 0),
        disk=resources.get('disk', 0),
        gpus=resources.get('gpus', 0),
    )
Example #9
    def _get_cluster_allocated_resources(self) -> ClustermanResources:
        allocated_resources = {
            resource: self.pool_manager.cluster_connector.get_resource_allocation(resource)
            for resource in ClustermanResources._fields
        }
        return ClustermanResources(**allocated_resources)
Example #10
def allocated_agent_resources(agent_dict: MesosAgentDict) -> ClustermanResources:
    used_resources = agent_dict.get('used_resources', {})
    return ClustermanResources(
        cpus=used_resources.get('cpus', 0),
        mem=used_resources.get('mem', 0),
        disk=used_resources.get('disk', 0),
        gpus=used_resources.get('gpus', 0),
    )
Example #11
def total_node_resources(
        node: KubernetesNode,
        excluded_pods: List[KubernetesPod]) -> ClustermanResources:
    base_total = ClustermanResources(
        cpus=ResourceParser.cpus(node.status.allocatable),
        mem=ResourceParser.mem(node.status.allocatable),
        disk=ResourceParser.disk(node.status.allocatable),
        gpus=ResourceParser.gpus(node.status.allocatable),
    )
    excluded_resources = allocated_node_resources(excluded_pods)
    return base_total - excluded_resources
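
The `-` on the last line implies that ClustermanResources supports elementwise arithmetic. A minimal sketch of how a NamedTuple can provide that (a hypothetical stand-in, not the real implementation):

from typing import NamedTuple

class ResourcesSketch(NamedTuple):
    cpus: float = 0
    mem: float = 0
    disk: float = 0
    gpus: float = 0

    def __sub__(self, other: 'ResourcesSketch') -> 'ResourcesSketch':
        # Subtract field by field; NamedTuples iterate in declaration order.
        return ResourcesSketch(*(a - b for a, b in zip(self, other)))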
Example #12
    def test_scale_most_constrained_resource(self, mock_autoscaler):
        resource_request = SignalResourceRequest(cpus=500, mem=30000, disk=19000, gpus=0)
        resource_totals = ClustermanResources(cpus=1000, mem=50000, disk=20000, gpus=0)
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 100
        mock_autoscaler.pool_manager.cluster_connector.get_cluster_total_resources.return_value = resource_totals
        new_target_capacity = mock_autoscaler._compute_target_capacity(resource_request)

        # disk would be the most constrained resource, so we should scale the target_capacity (100) by an amount
        # such that requested/(total*scale_factor) = setpoint
        expected_new_target_capacity = 100 * 19000 / (20000 * 0.7)
        assert new_target_capacity == pytest.approx(expected_new_target_capacity)
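
Worked out, the expected value follows directly from solving requested / (total * scale_factor) = setpoint for the scale factor:

setpoint = 0.7
current_capacity = 100
requested, total = 19000, 20000  # disk, the most constrained resource
scale_factor = requested / (total * setpoint)   # ~1.357
new_capacity = current_capacity * scale_factor  # ~135.71
assert round(new_capacity, 2) == 135.71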
Example #13
    def test_excluded_resources(self, mock_autoscaler):
        resource_request = SignalResourceRequest(cpus=500, mem=30000, disk=19000, gpus=0)
        resource_totals = ClustermanResources(cpus=1000, mem=50000, disk=20000, gpus=0)
        mock_autoscaler.autoscaling_config = AutoscalingConfig(['disk'], 0.7, 0.1)
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 100
        mock_autoscaler.pool_manager.cluster_connector.get_cluster_total_resources.return_value = resource_totals
        new_target_capacity = mock_autoscaler._compute_target_capacity(resource_request)

        # disk would be the most constrained resource, but it's excluded, so we scale on the next most constrained (mem)
        expected_new_target_capacity = 100 * 30000 / (50000 * 0.7)
        assert new_target_capacity == pytest.approx(expected_new_target_capacity)
Example #14
def test_get_historical_weighted_resource_value(mock_autoscaler):
    mock_autoscaler._get_smoothed_non_zero_metadata = mock.Mock(side_effect=[
        (100, 200, 78),   # historical non_orphan_fulfilled_capacity
        (100, 200, 20),   # cpus
        None,             # mem
        (100, 200, 0.1),  # disk
        (100, 200, 1),    # gpus
    ])
    assert mock_autoscaler._get_historical_weighted_resource_value() == ClustermanResources(
        cpus=20 / 78,
        mem=0,
        disk=0.1 / 78,
        gpus=1 / 78,
    )
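
The order of the side_effect tuples must match the iteration order of ClustermanResources._fields; assuming the fields are declared as in the snippets above, that is:

assert ClustermanResources._fields == ('cpus', 'mem', 'disk', 'gpus')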
Example #15
def total_pod_resources(pod: KubernetesPod) -> ClustermanResources:
    return ClustermanResources(
        cpus=sum(
            ResourceParser.cpus(c.resources.requests)
            for c in pod.spec.containers),
        mem=sum(
            ResourceParser.mem(c.resources.requests)
            for c in pod.spec.containers),
        disk=sum(
            ResourceParser.disk(c.resources.requests)
            for c in pod.spec.containers),
        gpus=sum(
            ResourceParser.gpus(c.resources.requests)
            for c in pod.spec.containers),
    )
Example #16
def test_get_options_for_instance_type(mock_asrg):
    mock_asrg._group_config['AvailabilityZones'] = ['us-west-1a', 'us-west-2a']
    result = mock_asrg._get_options_for_instance_type('m5.4xlarge')
    assert len(result) == 2
    assert all([
        r.agent.total_resources == ClustermanResources(
            cpus=16,
            mem=64 * 1024,
            disk=DEFAULT_VOLUME_SIZE_GB * 1024,
            gpus=0,
        ) for r in result
    ])
    assert result[0].instance.market == InstanceMarket('m5.4xlarge', 'us-west-1a')
    assert result[1].instance.market == InstanceMarket('m5.4xlarge', 'us-west-2a')
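
ClustermanResources.from_instance_type (used in Example #18 below) is not shown on this page; a hedged sketch of what such a lookup could look like, with a hypothetical instance-type table whose m5.4xlarge figures match the assertions above:

# Hypothetical table; a real implementation would derive this from EC2 instance metadata.
_INSTANCE_RESOURCES = {'m5.4xlarge': {'cpus': 16, 'mem': 64 * 1024, 'gpus': 0}}

def from_instance_type(instance_type: str) -> ClustermanResources:
    spec = _INSTANCE_RESOURCES[instance_type]
    return ClustermanResources(
        cpus=spec['cpus'],
        mem=spec['mem'],
        disk=DEFAULT_VOLUME_SIZE_GB * 1024,  # assumed fallback for EBS-only instance types
        gpus=spec['gpus'],
    )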
Example #17
    def test_current_target_capacity_no_historical_data(self, mock_autoscaler):
        mock_autoscaler.pool_manager.cluster_connector.get_resource_total.return_value = 0
        mock_autoscaler.pool_manager.target_capacity = 0
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
        mock_autoscaler._get_historical_weighted_resource_value = mock.Mock(return_value=ClustermanResources())

        new_target_capacity = mock_autoscaler._compute_target_capacity(
            {'cpus': 7, 'mem': 400, 'disk': 70, 'gpus': 0},
        )
        assert new_target_capacity == 1
Example #18
    def _get_options_for_instance_type(
        self,
        instance_type: str,
        weight: Optional[float] = None,
    ) -> List[ClusterNodeMetadata]:
        """ Generate a list of possible ClusterNode types that could be added to this ASG,
        given a particular instance type """

        options = []
        az_options = self._group_config['AvailabilityZones']
        for az in az_options:
            instance_market = InstanceMarket(instance_type, az)
            weight = weight or self.market_weight(instance_market)  # note: once set, the first market's weight is reused for every remaining AZ
            options.append(ClusterNodeMetadata(
                agent=AgentMetadata(total_resources=ClustermanResources.from_instance_type(instance_type)),
                instance=InstanceMetadata(market=instance_market, weight=weight),
            ))
        return options
Example #19
    def _get_agent_metadata(self, instance_ip: str) -> AgentMetadata:
        for c in self.simulator.aws_clusters:
            for i in c.instances.values():
                if instance_ip == i.ip_address:
                    return AgentMetadata(
                        agent_id=str(uuid.uuid4()),
                        state=(
                            AgentState.ORPHANED
                            if self.simulator.current_time < i.join_time
                            else AgentState.IDLE
                        ),
                        total_resources=ClustermanResources(
                            cpus=i.resources.cpus,
                            mem=i.resources.mem * 1000,
                            disk=(i.resources.disk or staticconf.read_int('ebs_volume_size', 0)) * 1000,
                            gpus=i.resources.gpus,
                        )
                    )

        # if we don't know the given IP then it's orphaned
        return AgentMetadata(state=AgentState.ORPHANED)
Example #20
def allocated_node_resources(pods: List[KubernetesPod]) -> ClustermanResources:
    cpus = mem = disk = gpus = 0
    for pod in pods:
        cpus += sum(
            ResourceParser.cpus(c.resources.requests)
            for c in pod.spec.containers)
        mem += sum(
            ResourceParser.mem(c.resources.requests)
            for c in pod.spec.containers)
        disk += sum(
            ResourceParser.disk(c.resources.requests)
            for c in pod.spec.containers)
        gpus += sum(
            ResourceParser.gpus(c.resources.requests)
            for c in pod.spec.containers)

    return ClustermanResources(
        cpus=cpus,
        mem=mem,
        disk=disk,
        gpus=gpus,
    )
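
This repeats the per-pod sums from Example #15; if ClustermanResources defined an elementwise __add__ (analogous to the __sub__ implied by Example #11), the same result could be a single fold. A hedged sketch:

import functools
import operator
from typing import List

def allocated_node_resources_sketch(pods: List[KubernetesPod]) -> ClustermanResources:
    # Assumes ClustermanResources supports elementwise addition.
    return functools.reduce(operator.add, (total_pod_resources(p) for p in pods), ClustermanResources())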
Example #21
    def _get_most_constrained_resource_for_request(
        self,
        resource_request: SignalResponseDict,
        cluster_total_resources: ClustermanResources,
    ) -> Tuple[str, float]:
        """Determine what would be the most constrained resource if were to fulfill a resource_request without scaling
        the cluster.

        :param resource_request: dictionary of resource name (cpus, mem, disk, gpus) to the requested quantity of that resource
        :param cluster_total_resources: the currently available resources in the cluster
        :returns: a tuple of the most constrained resource name and its utilization percentage if the provided request
            were to be fulfilled
        """
        requested_resource_usage_pcts = {}
        for resource, resource_total in cluster_total_resources._asdict().items():
            resource_request_value = resource_request.get(resource)
            if resource_request_value is None:
                continue

            if resource in self.autoscaling_config.excluded_resources:
                logger.info(
                    f'Signal requested {resource_request_value} {resource} but it is excluded from scaling decisions'
                )
                continue

            if resource_total == 0:
                if resource_request_value > 0:
                    raise ResourceRequestError(
                        f'Signal requested {resource_request_value} for {resource} '
                        "but the cluster doesn't have any of that resource")
                requested_resource_usage_pcts[resource] = 0
            else:
                requested_resource_usage_pcts[resource] = resource_request_value / resource_total
        return max(requested_resource_usage_pcts.items(), key=lambda x: x[1])
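
A quick usage sketch of the same max-utilization logic, with the numbers from Example #12 (plain dicts stand in for the real types):

totals = {'cpus': 1000, 'mem': 50000, 'disk': 20000}
request = {'cpus': 500, 'mem': 30000, 'disk': 19000}
usage_pcts = {r: request[r] / totals[r] for r in request}  # {'cpus': 0.5, 'mem': 0.6, 'disk': 0.95}
assert max(usage_pcts.items(), key=lambda x: x[1]) == ('disk', 0.95)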
Example #22
def allocated_resources():
    return ClustermanResources(cpus=150, mem=1000, disk=500, gpus=0)
Example #23
def create_k8s_autoscaler(context, prevent_scale_down_after_capacity_loss=False):
    behave.use_fixture(autoscaler_patches, context)
    context.mock_cluster_connector.__class__ = KubernetesClusterConnector
    context.mock_cluster_connector.get_cluster_allocated_resources.return_value = ClustermanResources(
        cpus=context.allocated_cpus,
    )
    context.mock_cluster_connector._pending_pods = []
    if float(context.pending_cpus) > 0:
        context.mock_cluster_connector.get_unschedulable_pods = \
            lambda: KubernetesClusterConnector.get_unschedulable_pods(context.mock_cluster_connector)
        context.mock_cluster_connector._get_pod_unschedulable_reason.side_effect = lambda pod: (
            PodUnschedulableReason.InsufficientResources
            if pod.metadata.name == 'pod1' else PodUnschedulableReason.Unknown)
        context.mock_cluster_connector._pending_pods = [
            V1Pod(
                metadata=V1ObjectMeta(name='pod1'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[
                        V1PodCondition(status='False',
                                       type='PodScheduled',
                                       reason='Unschedulable')
                    ],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(name='container1',
                                resources=V1ResourceRequirements(
                                    requests={'cpu': context.pending_cpus})),
                ]),
            ),
            V1Pod(
                metadata=V1ObjectMeta(name='pod2'),
                status=V1PodStatus(
                    phase='Pending',
                    conditions=[
                        V1PodCondition(status='False',
                                       type='PodScheduled',
                                       reason='Unschedulable')
                    ],
                ),
                spec=V1PodSpec(containers=[
                    V1Container(name='container1',
                                resources=V1ResourceRequirements(
                                    requests={'cpu': context.pending_cpus})),
                ]),
            ),
        ]

    context.autoscaler = Autoscaler(
        cluster='kube-test',
        pool='bar',
        apps=['bar'],
        scheduler='kubernetes',
        metrics_client=mock.Mock(),
        monitoring_enabled=False,
    )

    if prevent_scale_down_after_capacity_loss:
        context.autoscaler.autoscaling_config = AutoscalingConfig(
            excluded_resources=[],
            setpoint=0.7,
            target_capacity_margin=0.1,
            prevent_scale_down_after_capacity_loss=True,
            instance_loss_threshold=0)
Example #24
def change_allocated_cpus():
    reload_fn()
    context.mock_cluster_connector.get_cluster_allocated_resources.return_value = ClustermanResources(
        cpus=context.allocated_cpus,
    )
Example #25
def autoscaler_patches(context):
    behave.use_fixture(boto_patches, context)
    resource_groups = {}
    for i in range(context.rgnum):
        resource_groups[f'rg{i}'] = mock.Mock(
            spec=SpotFleetResourceGroup,
            id=f'rg{i}',
            target_capacity=context.target_capacity / context.rgnum,
            fulfilled_capacity=context.target_capacity / context.rgnum,
            is_stale=False,
            min_capacity=0,
            max_capacity=float('inf'),
        )

    resource_totals = ClustermanResources(cpus=context.cpus,
                                          mem=context.mem,
                                          disk=context.disk,
                                          gpus=context.gpus)

    with staticconf.testing.PatchConfiguration(
        {'autoscaling': {
            'default_signal_role': 'bar'
        }}, ), mock.patch(
            'clusterman.autoscaler.autoscaler.get_monitoring_client',
        ), mock.patch(
            'clusterman.aws.util.SpotFleetResourceGroup.load',
            return_value=resource_groups,
        ), mock.patch(
            'clusterman.autoscaler.pool_manager.PoolManager',
            wraps=PoolManager,
        ), mock.patch(
            'clusterman.autoscaler.autoscaler.PoolManager.prune_excess_fulfilled_capacity',
        ), mock.patch(
            'clusterman.autoscaler.pool_manager.ClusterConnector.load',
        ) as mock_cluster_connector, mock.patch(
            'clusterman.autoscaler.autoscaler.PoolManager._calculate_non_orphan_fulfilled_capacity',
            return_value=context.target_capacity,
        ), mock.patch(
            'clusterman.signals.external_signal.ExternalSignal._connect_to_signal_process',
        ), mock.patch(
            'clusterman.signals.external_signal.get_metrics_for_signal',
        ) as mock_metrics, mock_dynamodb2():
        dynamodb.create_table(
            TableName=CLUSTERMAN_STATE_TABLE,
            KeySchema=[
                {
                    'AttributeName': 'state',
                    'KeyType': 'HASH'
                },
                {
                    'AttributeName': 'entity',
                    'KeyType': 'RANGE'  # the DynamoDB API calls the sort key 'RANGE'; 'SORT' is not a valid KeyType
                },
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': 'state',
                    'AttributeType': 'S'
                },
                {
                    'AttributeName': 'entity',
                    'AttributeType': 'S'
                },
            ],
        )
        mock_metrics.return_value = {}  # don't know why this is necessary but we get flaky tests if it's not set
        mock_cluster_connector.return_value.get_cluster_total_resources.return_value = resource_totals
        context.mock_cluster_connector = mock_cluster_connector.return_value
        yield
Example #26
    def test_single_resource(self, mock_autoscaler, resource, signal_resource, total_resource, expected_capacity):
        mock_autoscaler.pool_manager.target_capacity = 125
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 125
        mock_autoscaler.pool_manager.cluster_connector.get_cluster_total_resources.return_value = ClustermanResources(
            cpus=total_resource,
            mem=total_resource,
            disk=total_resource,
            gpus=total_resource,
        )
        new_target_capacity = mock_autoscaler._compute_target_capacity(SignalResourceRequest(
            **{resource: signal_resource},
        ))
        assert new_target_capacity == pytest.approx(expected_capacity)
Example #27
    def test_current_target_capacity_no_historical_data(self, mock_autoscaler):
        mock_autoscaler.pool_manager.cluster_connector.get_resource_total.return_value = 0
        mock_autoscaler.pool_manager.target_capacity = 0
        mock_autoscaler.pool_manager.non_orphan_fulfilled_capacity = 0
        mock_autoscaler._get_historical_weighted_resource_value = mock.Mock(return_value=ClustermanResources())

        new_target_capacity = mock_autoscaler._compute_target_capacity(
            SignalResourceRequest(cpus=7, mem=400, disk=70, gpus=0),
        )
        assert new_target_capacity == 1
Example #28
def test_get_historical_weighted_resource_value_no_historical_data(mock_autoscaler):
    mock_autoscaler._get_smoothed_non_zero_metadata = mock.Mock(return_value=None)
    assert mock_autoscaler._get_historical_weighted_resource_value() == ClustermanResources()