def autoscale_marathon_instance(marathon_service_config, marathon_tasks,
                                mesos_tasks):
    """Run one autoscaling pass for a marathon service instance.

    Measures utilization via the configured metrics provider, converts it to
    an error relative to the setpoint, asks the configured decision policy
    for a scaling delta, and persists the new (limited) instance count when
    it differs from the current one.
    """
    current_instances = marathon_service_config.get_instances()

    # Only trust the metrics when marathon is running exactly the expected
    # number of tasks; otherwise it is still converging and we should wait.
    if len(marathon_tasks) != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling as marathon is either waiting for resources or is delayed'
        )
        return

    params = marathon_service_config.get_autoscaling_params()
    # Pop the plugin selectors out; the remainder of the dict is forwarded
    # verbatim to both the provider and the decision policy.
    metrics_provider = get_service_metrics_provider(
        params.pop(SERVICE_METRICS_PROVIDER_KEY))
    decision_policy = get_decision_policy(params.pop(DECISION_POLICY_KEY))

    utilization = metrics_provider(
        marathon_service_config=marathon_service_config,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
        **params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=params.pop('setpoint'),
        current_instances=current_instances,
    )

    zk_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    delta = decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zk_path,
        **params)

    target = marathon_service_config.limit_instance_count(
        current_instances + delta)
    if target == current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' %
            (current_instances, humanize_error(error)),
            level='debug',
        )
        return

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' %
        (current_instances, target, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=target,
    )
Esempio n. 2
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Run one autoscaling pass for a marathon service instance.

    Bails out early when marathon is not running exactly the expected number
    of tasks, since utilization gathered in that state is unreliable.
    """
    expected = marathon_service_config.get_instances()
    if len(marathon_tasks) != expected:
        write_to_log(config=marathon_service_config,
                     line='Delaying scaling as marathon is either waiting for resources or is delayed')
        return

    params = marathon_service_config.get_autoscaling_params()
    # Plugin selectors are popped; everything left is forwarded verbatim to
    # the metrics provider and the decision policy.
    provider = get_service_metrics_provider(params.pop(SERVICE_METRICS_PROVIDER_KEY))
    policy = get_decision_policy(params.pop(DECISION_POLICY_KEY))

    utilization = provider(marathon_service_config, marathon_tasks,
                           mesos_tasks, **params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=params.pop('setpoint'),
        current_instances=expected,
    )

    zk_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    delta = policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=expected,
        zookeeper_path=zk_root,
        **params
    )

    target = marathon_service_config.limit_instance_count(expected + delta)
    if target == expected:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (expected, humanize_error(error)),
            level='debug',
        )
        return

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (
            expected, target, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=target,
    )
Esempio n. 3
0
def update_autoscaler_count(request):
    """Override the autoscaler's desired instance count for a service instance.

    Reads ``service``, ``instance`` and ``json_body.desired_instances`` from
    the swagger request, validates the count, writes it to zookeeper, and
    returns the (clamped) count plus a status string with HTTP 202.

    Raises:
        ApiFailure(400): the body's "desired_instances" is not an integer.
        ApiFailure(404): the service config cannot be loaded, or autoscaling
            (max_instances) is not configured for this instance.
    """
    service = request.swagger_data.get("service")
    instance = request.swagger_data.get("instance")
    desired_instances = request.swagger_data.get(
        "json_body")["desired_instances"]
    if not isinstance(desired_instances, int):
        error_message = 'The provided body does not have an integer value for "desired_instances": {}'.format(
            request.swagger_data.get("json_body"))
        # Fix: a malformed client body is a client error (4xx), not a server
        # fault — previously this raised a 500.
        raise ApiFailure(error_message, 400)

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = f"Unable to load service config for {service}.{instance}"
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        # No max_instances means autoscaling is not configured at all.
        error_message = f"Autoscaling is not enabled for {service}.{instance}"
        raise ApiFailure(error_message, 404)

    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(service=service,
                                       instance=instance,
                                       instance_count=desired_instances)
    # Report the clamped value back so the caller knows what will actually
    # take effect, with a warning when the request was out of range.
    status = "SUCCESS"
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = (
            "WARNING desired_instances is greater than max_instances %d" %
            max_instances)
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = ("WARNING desired_instances is less than min_instances %d" %
                  min_instances)

    response_body = {"desired_instances": desired_instances, "status": status}
    return Response(json_body=response_body, status_code=202)
Esempio n. 4
0
def update_autoscaler_count(request):
    """Override the autoscaler's desired instance count for a service instance.

    Reads ``service``, ``instance`` and ``json_body.desired_instances`` from
    the swagger request, validates the count, writes it to zookeeper, and
    returns the (clamped) count plus a status string with HTTP 202.

    Raises:
        ApiFailure(400): the body's "desired_instances" is not an integer.
        ApiFailure(404): the service config cannot be loaded, or autoscaling
            (max_instances) is not configured for this instance.
    """
    service = request.swagger_data.get('service')
    instance = request.swagger_data.get('instance')
    desired_instances = request.swagger_data.get(
        'json_body')['desired_instances']
    # Fix: validate before touching zookeeper — previously a non-integer body
    # value was written to zk and then crashed in the min/max comparison below.
    if not isinstance(desired_instances, int):
        error_message = (
            'The provided body does not have an integer value for '
            '"desired_instances": %s' % request.swagger_data.get('json_body'))
        raise ApiFailure(error_message, 400)

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = 'Unable to load service config for %s.%s' % (service,
                                                                     instance)
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        # No max_instances means autoscaling is not configured at all.
        error_message = 'Autoscaling is not enabled for %s.%s' % (service,
                                                                  instance)
        raise ApiFailure(error_message, 404)

    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(service=service,
                                       instance=instance,
                                       instance_count=desired_instances)
    # Report the clamped value back so the caller knows what will actually
    # take effect, with a warning when the request was out of range.
    status = 'SUCCESS'
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = 'WARNING desired_instances is greater than max_instances %d' % max_instances
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = 'WARNING desired_instances is less than min_instances %d' % min_instances

    response_body = {'desired_instances': desired_instances, 'status': status}
    return Response(json_body=response_body, status_code=202)
Esempio n. 5
0
def update_autoscaler_count(request):
    """Override the autoscaler's desired instance count for a service instance.

    Reads ``service``, ``instance`` and ``json_body.desired_instances`` from
    the swagger request, validates the count, writes it to zookeeper, and
    returns the (clamped) count plus a status string with HTTP 202.

    Raises:
        ApiFailure(400): the body's "desired_instances" is not an integer.
        ApiFailure(404): the service config cannot be loaded, or autoscaling
            (max_instances) is not configured for this instance.
    """
    service = request.swagger_data.get('service')
    instance = request.swagger_data.get('instance')
    desired_instances = request.swagger_data.get('json_body')['desired_instances']
    # Fix: validate before touching zookeeper — previously a non-integer body
    # value was written to zk and then crashed in the min/max comparison below.
    if not isinstance(desired_instances, int):
        error_message = (
            'The provided body does not have an integer value for '
            '"desired_instances": %s' % request.swagger_data.get('json_body'))
        raise ApiFailure(error_message, 400)

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = 'Unable to load service config for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        # No max_instances means autoscaling is not configured at all.
        error_message = 'Autoscaling is not enabled for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)

    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(service=service, instance=instance, instance_count=desired_instances)
    # Report the clamped value back so the caller knows what will actually
    # take effect, with a warning when the request was out of range.
    status = 'SUCCESS'
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = 'WARNING desired_instances is greater than max_instances %d' % max_instances
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = 'WARNING desired_instances is less than min_instances %d' % min_instances

    response_body = {'desired_instances': desired_instances, 'status': status}
    return Response(json_body=response_body, status_code=202)
Esempio n. 6
0
def autoscale_marathon_instance(marathon_service_config, system_paasta_config,
                                marathon_tasks, mesos_tasks):
    """Run one autoscaling pass for a marathon service instance.

    Measures utilization, converts it to an error relative to the configured
    setpoint, asks for a new instance count, and writes the new count to
    zookeeper when it differs from the current one.  Also emits instance-count
    gauges to meteorite when that library is available.
    """
    current_instances = marathon_service_config.get_instances()
    # When too few healthy tasks are running, utilization data cannot be
    # trusted for a scale-*down* decision (see the guard below).
    task_data_insufficient = is_task_data_insufficient(marathon_service_config,
                                                       marathon_tasks,
                                                       current_instances)
    autoscaling_params = marathon_service_config.get_autoscaling_params()
    # Filled in by get_utilization() with per-provider debug data; only used
    # for the "Autoscaler clamped" log line below.
    log_utilization_data = {}
    utilization = get_utilization(
        marathon_service_config=marathon_service_config,
        system_paasta_config=system_paasta_config,
        autoscaling_params=autoscaling_params,
        log_utilization_data=log_utilization_data,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
    )
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params['setpoint'],
        current_instances=current_instances,
    )
    new_instance_count = get_new_instance_count(
        utilization=utilization,
        error=error,
        autoscaling_params=autoscaling_params,
        current_instances=current_instances,
        marathon_service_config=marathon_service_config,
        num_healthy_instances=len(marathon_tasks),
    )

    # presumably get_new_instance_count() clamps scale-downs at 70% of the
    # current count; equality with this threshold is used below to detect a
    # clamped decision — TODO confirm against get_new_instance_count.
    safe_downscaling_threshold = int(current_instances * 0.7)
    if new_instance_count != current_instances:
        if new_instance_count < current_instances and task_data_insufficient:
            # NOTE(review): this early return also skips the meteorite gauges
            # at the bottom of the function — confirm that is intended.
            write_to_log(
                config=marathon_service_config,
                line=
                'Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                'we make a decision to scale down.',
            )
            return
        if new_instance_count == safe_downscaling_threshold:
            write_to_log(
                config=marathon_service_config,
                line='Autoscaler clamped: %s' % str(log_utilization_data),
                level='debug',
            )

        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' % (
                current_instances,
                new_instance_count,
                humanize_error(error),
            ),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' %
            (current_instances, humanize_error(error)),
            level='debug',
        )
    meteorite_dims = {
        'service_name': marathon_service_config.service,
        'decision_policy': autoscaling_params[DECISION_POLICY_KEY],
        'paasta_cluster': marathon_service_config.cluster,
        'instance_name': marathon_service_config.instance,
    }
    # yelp_meteorite is an optional dependency; skip metrics when absent.
    if yelp_meteorite:
        gauge = yelp_meteorite.create_gauge('paasta.service.instances',
                                            meteorite_dims)
        gauge.set(new_instance_count)
        gauge = yelp_meteorite.create_gauge('paasta.service.max_instances',
                                            meteorite_dims)
        # assumes min/max instances are configured (autoscaling enabled) —
        # TODO confirm callers guarantee this before reaching here.
        gauge.set(marathon_service_config.get_max_instances())
        gauge = yelp_meteorite.create_gauge('paasta.service.min_instances',
                                            meteorite_dims)
        gauge.set(marathon_service_config.get_min_instances())
Esempio n. 7
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks,
                                mesos_tasks):
    """Run one autoscaling pass for a marathon service instance.

    Skips the pass when the number of running tasks deviates from the
    expected count by more than MAX_TASK_DELTA (unless we are below
    min_instances, in which case we scale straight up to the minimum).
    Otherwise measures utilization, asks the decision policy for a delta,
    clamps downscaling to 30% of current instances, and persists the result.
    """
    current_instances = marathon_service_config.get_instances()
    # Tolerance band: only autoscale when the running task count is within
    # +/- MAX_TASK_DELTA of the expected instance count.
    too_many_instances_running = len(marathon_tasks) > int(
        (1 + MAX_TASK_DELTA) * current_instances)
    too_few_instances_running = len(marathon_tasks) < int(
        (1 - MAX_TASK_DELTA) * current_instances)
    if too_many_instances_running or too_few_instances_running:
        if current_instances < marathon_service_config.get_min_instances():
            # Below the configured floor: bump straight to min_instances
            # rather than waiting for the task count to stabilise.
            write_to_log(
                config=marathon_service_config,
                line=
                'Scaling from %d to %d instances because we are below min_instances'
                % (current_instances,
                   marathon_service_config.get_min_instances()))
            set_instances_for_marathon_service(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
                instance_count=marathon_service_config.get_min_instances())

        else:
            write_to_log(
                config=marathon_service_config,
                line=
                'Delaying scaling as we found too many or too few tasks running in marathon. '
                'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                'waiting for tasks to be killed.')
        return
    autoscaling_params = marathon_service_config.get_autoscaling_params()
    # Pop the plugin selectors; the remaining params are forwarded verbatim
    # to both the metrics provider and the decision policy.
    autoscaling_metrics_provider = get_service_metrics_provider(
        autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    autoscaling_decision_policy = get_decision_policy(
        autoscaling_params.pop(DECISION_POLICY_KEY))

    # Filled in by the provider with per-provider debug data; only used for
    # the "Autoscaler clamped" log line below.
    log_utilization_data = {}

    utilization = autoscaling_metrics_provider(
        marathon_service_config=marathon_service_config,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
        log_utilization_data=log_utilization_data,
        **autoscaling_params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )

    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    autoscaling_amount = autoscaling_decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **autoscaling_params)

    # Limit downscaling by 30% of current_instances until we find out what is
    # going on in such situations
    safe_downscaling_threshold = int(current_instances * 0.7)
    new_instance_count = max(current_instances + autoscaling_amount,
                             safe_downscaling_threshold)

    new_instance_count = marathon_service_config.limit_instance_count(
        new_instance_count)
    if new_instance_count != current_instances:
        if new_instance_count == safe_downscaling_threshold:
            # The 30% clamp above kicked in; log the raw utilization data so
            # the aggressive scale-down request can be investigated.
            write_to_log(
                config=marathon_service_config,
                line='Autoscaler clamped: %s' % str(log_utilization_data),
                level='debug',
            )

        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' %
            (current_instances, new_instance_count, humanize_error(error)),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' %
            (current_instances, humanize_error(error)),
            level='debug',
        )
Esempio n. 8
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks,
                                mesos_tasks):
    """Run one autoscaling pass for a marathon service instance.

    Measures utilization, converts it to an error relative to the setpoint,
    computes a new instance count, and persists it to zookeeper — unless we
    would scale down while healthy-task data is insufficient, in which case
    the decision is deferred.
    """
    current_instances = marathon_service_config.get_instances()
    data_insufficient = is_task_data_insufficient(marathon_service_config,
                                                  marathon_tasks,
                                                  current_instances)
    params = marathon_service_config.get_autoscaling_params()
    # Populated by get_utilization() with per-provider debug data; only used
    # for the "Autoscaler clamped" log line below.
    utilization_log = {}
    utilization = get_utilization(
        marathon_service_config=marathon_service_config,
        autoscaling_params=params,
        log_utilization_data=utilization_log,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=params['setpoint'],
        current_instances=current_instances,
    )
    target = get_new_instance_count(
        utilization=utilization,
        error=error,
        autoscaling_params=params,
        current_instances=current_instances,
        marathon_service_config=marathon_service_config,
        num_healthy_instances=len(marathon_tasks),
    )

    clamp_floor = int(current_instances * 0.7)

    if target == current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' %
            (current_instances, humanize_error(error)),
            level='debug',
        )
        return

    if target < current_instances and data_insufficient:
        # Defer scale-downs until we have enough healthy-task data to trust
        # the utilization numbers.
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                 'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                 'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                 'we make a decision to scale down.')
        return

    if target == clamp_floor:
        # Hitting exactly 70% of current instances indicates a clamped
        # scale-down decision; log the raw data for investigation.
        write_to_log(
            config=marathon_service_config,
            line='Autoscaler clamped: %s' % str(utilization_log),
            level='debug',
        )

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' %
        (current_instances, target, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=target,
    )
Esempio n. 9
0
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Run one lock-guarded autoscaling pass for a marathon service instance.

    Acquires a per-service/instance autoscaling lock, measures utilization,
    computes a new instance count, records the decision, and persists the new
    count to zookeeper — unless we would scale down while healthy-task data
    is insufficient, in which case the decision is deferred.  If another
    process holds the lock, the run is skipped with a warning.
    """
    try:
        with create_autoscaling_lock(marathon_service_config.service,
                                     marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            # When too few healthy tasks are running, utilization data cannot
            # be trusted for a scale-*down* decision (see the guard below).
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params(
            )
            # Populated by get_utilization() with per-provider debug data and
            # forwarded to _record_autoscaling_decision().
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=current_instances,
            )
            num_healthy_instances = len(marathon_tasks)
            # persist_data is disabled when task data is insufficient so the
            # decision state is not updated from an untrustworthy sample.
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=num_healthy_instances,
                persist_data=(not task_data_insufficient),
            )
            # presumably 70% of current instances marks a clamped scale-down
            # decision inside get_new_instance_count — TODO confirm.
            safe_downscaling_threshold = int(current_instances * 0.7)
            # Record the decision (even when we end up not acting on it).
            _record_autoscaling_decision(
                marathon_service_config=marathon_service_config,
                autoscaling_params=autoscaling_params,
                utilization=utilization,
                log_utilization_data=log_utilization_data,
                error=error,
                current_instances=current_instances,
                num_healthy_instances=num_healthy_instances,
                new_instance_count=new_instance_count,
                safe_downscaling_threshold=safe_downscaling_threshold,
                task_data_insufficient=task_data_insufficient,
            )
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    # Defer scale-downs until healthy-task data is sufficient.
                    write_to_log(
                        config=marathon_service_config,
                        line=
                        "Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                        "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                        "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                        "we make a decision to scale down.",
                        level="debug",
                    )
                    return
                else:
                    set_instances_for_marathon_service(
                        service=marathon_service_config.service,
                        instance=marathon_service_config.instance,
                        instance_count=new_instance_count,
                    )
                    write_to_log(
                        config=marathon_service_config,
                        line="Scaling from %d to %d instances (%s)" % (
                            current_instances,
                            new_instance_count,
                            humanize_error(error),
                        ),
                        level="event",
                    )
    except LockHeldException:
        # Another autoscaler run owns the lock; skip this pass entirely.
        log.warning(
            "Skipping autoscaling run for {service}.{instance} because the lock is held"
            .format(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
            ))