Example #1
0
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """
    Gets the average cpu utilization of a service across all of its tasks.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from

    :returns: the service's average utilization, from 0 to 1
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {task_id: float(stats.get('cpus_system_time_secs', 0.0) + stats.get(
        'cpus_user_time_secs', 0.0)) / (stats.get('cpus_limit', 0) - .1) for task_id, stats in mesos_tasks.items()}

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError("""The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run.""")

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)

    return average_utilization
def get_new_instance_count(
    utilization,
    error,
    autoscaling_params,
    current_instances,
    marathon_service_config,
    num_healthy_instances,
    persist_data: bool,
):
    autoscaling_decision_policy = get_decision_policy(
        autoscaling_params[DECISION_POLICY_KEY])

    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    autoscaling_amount = autoscaling_decision_policy(
        utilization=utilization,
        error=error,
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        num_healthy_instances=num_healthy_instances,
        persist_data=persist_data,
        **autoscaling_params,
    )

    # Limit downscaling by 30% of current_instances until we find out what is
    # going on in such situations
    safe_downscaling_threshold = int(current_instances * 0.7)
    new_instance_count = max(current_instances + autoscaling_amount,
                             safe_downscaling_threshold)

    new_instance_count = marathon_service_config.limit_instance_count(
        new_instance_count)
    return new_instance_count
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from

    :returns: the service's mean utilization, from 0 to 1
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {task_id: float(stats.get('cpus_system_time_secs', 0.0) + stats.get(
        'cpus_user_time_secs', 0.0)) / (stats.get('cpus_limit', 0) - .1) for task_id, stats in mesos_tasks.items()}

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError("""The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run.""")

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)

    return mean_utilization
Example #4
0
def threshold_decision_policy(marathon_service_config, metrics_provider_method, marathon_tasks, mesos_tasks,
                              delay=600, setpoint=0.8, threshold=0.1, **kwargs):
    """
    Decides to autoscale a service up or down if the service utilization exceeds the setpoint
    by a certain threshold.
    """
    zk_last_time_path = '%s/threshold_last_time' % compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_time = float(last_time)
        except NoNodeError:
            last_time = 0.0

    current_time = int(datetime.now().strftime('%s'))
    if current_time - last_time < delay:
        return 0

    error = metrics_provider_method(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs) - setpoint

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_time_path, str(current_time))

    if error > threshold:
        return 1
    elif abs(error) > threshold:
        return -1
    else:
        return 0
Example #5
0
def pid_decision_policy(marathon_service_config, metrics_provider_method, marathon_tasks, mesos_tasks,
                        delay=600, setpoint=0.8, **kwargs):
    """
    Uses a PID to determine when to autoscale a service.
    See https://en.wikipedia.org/wiki/PID_controller for more information on PIDs.
    Kp, Ki and Kd are the canonical PID constants, where the output of the PID is:
    Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)
    """
    Kp = 0.2
    Ki = 0.2 / delay
    Kd = 0.05 * delay

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_iterm_path = '%s/pid_iterm' % autoscaling_root
    zk_last_error_path = '%s/pid_last_error' % autoscaling_root
    zk_last_time_path = '%s/pid_last_time' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0

    if int(datetime.now().strftime('%s')) - last_time < delay:
        return 0

    utilization = metrics_provider_method(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs)
    error = utilization - setpoint

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    iterm = clamp_value(iterm + (Ki * error) * time_delta)

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))
        zk.set(zk_last_time_path, str(current_time))

    return int(round(clamp_value(Kp * error + iterm + Kd * (error - last_error) / time_delta)))
Example #6
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks,
                                mesos_tasks):
    current_instances = marathon_service_config.get_instances()
    if len(marathon_tasks) != current_instances:
        write_to_log(
            config=marathon_service_config,
            line=
            'Delaying scaling as marathon is either waiting for resources or is delayed'
        )
        return
    autoscaling_params = marathon_service_config.get_autoscaling_params()
    autoscaling_metrics_provider = get_service_metrics_provider(
        autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    autoscaling_decision_policy = get_decision_policy(
        autoscaling_params.pop(DECISION_POLICY_KEY))

    utilization = autoscaling_metrics_provider(marathon_service_config,
                                               marathon_tasks, mesos_tasks,
                                               **autoscaling_params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )

    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    autoscaling_amount = autoscaling_decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **autoscaling_params)

    new_instance_count = marathon_service_config.limit_instance_count(
        current_instances + autoscaling_amount)
    if new_instance_count != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' %
            (current_instances, new_instance_count, humanize_error(error)),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' %
            (current_instances, humanize_error(error)),
            level='debug',
        )
Example #7
0
def pid_decision_policy(marathon_service_config, error, **kwargs):
    """
    Uses a PID to determine when to autoscale a service.
    See https://en.wikipedia.org/wiki/PID_controller for more information on PIDs.
    Kp, Ki and Kd are the canonical PID constants, where the output of the PID is:
    Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)
    """
    Kp = 0.2
    Ki = 0.2 / AUTOSCALING_DELAY
    Kd = 0.05 * AUTOSCALING_DELAY

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_iterm_path = '%s/pid_iterm' % autoscaling_root
    zk_last_error_path = '%s/pid_last_error' % autoscaling_root
    zk_last_time_path = '%s/pid_last_time' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    iterm = clamp_value(iterm + (Ki * error) * time_delta)

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))
        zk.set(zk_last_time_path, str(current_time))

    return int(
        round(
            clamp_value(Kp * error + iterm + Kd *
                        (error - last_error) / time_delta)))
Example #8
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    current_instances = marathon_service_config.get_instances()
    if len(marathon_tasks) != current_instances:
        write_to_log(config=marathon_service_config,
                     line='Delaying scaling as marathon is either waiting for resources or is delayed')
        return
    autoscaling_params = marathon_service_config.get_autoscaling_params()
    autoscaling_metrics_provider = get_service_metrics_provider(autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    autoscaling_decision_policy = get_decision_policy(autoscaling_params.pop(DECISION_POLICY_KEY))

    utilization = autoscaling_metrics_provider(marathon_service_config, marathon_tasks,
                                               mesos_tasks, **autoscaling_params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )

    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    autoscaling_amount = autoscaling_decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **autoscaling_params
    )

    new_instance_count = marathon_service_config.limit_instance_count(current_instances + autoscaling_amount)
    if new_instance_count != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' % (
                current_instances, new_instance_count, humanize_error(error)),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data={},
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance()

    :returns: the service's mean utilization, from 0 to 1
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(",")
                             if datum)
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        if fut.exception():
            return None
        else:
            return fut.result()

    mesos_tasks_stats = dict(
        zip(
            [task["id"] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures],
        ))

    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is not None:
            try:
                utime = float(stats["cpus_user_time_secs"])
                stime = float(stats["cpus_system_time_secs"])
                limit = float(stats["cpus_limit"]) - 0.1
                mesos_cpu_data[task_id] = (stime + utime) / limit
            except KeyError:
                pass

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}"
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(":")
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
Example #10
0
def mesos_cpu_ram_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """
    Gets the average utilization of a service across all of its tasks, where the utilization of
    a task is the maximum value between its cpu and ram utilization.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from

    :returns: the service's average utilization, from 0 to 1
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {task_id: float(stats.get('cpus_system_time_secs', 0.0) + stats.get(
        'cpus_user_time_secs', 0.0)) / (stats.get('cpus_limit', 0) - .1) for task_id, stats in mesos_tasks.items()}

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    for task_id, stats in mesos_tasks.items():
        if stats.get('mem_limit_bytes', 0) != 0:
            utilization[task_id] = max(
                utilization.get(task_id, 0),
                float(stats.get('mem_rss_bytes', 0)) / stats.get('mem_limit_bytes', 0),
            )

    if not utilization:
        raise MetricsProviderNoDataError("Couldn't get any cpu or ram data from Mesos")

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    return average_utilization