Example #1
0
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """
    Gets the average cpu utilization of a service across all of its tasks.

    Utilization is the change in each task's cumulative, limit-normalised CPU
    seconds since the previous run, divided by the elapsed wall-clock time.
    The previous sample is read from Zookeeper and the current sample is
    written back for the next run.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param kwargs: ignored

    :returns: the service's average utilization, nominally from 0 to 1
    :raises MetricsProviderNoDataError: if Mesos yields no usable cpu data, if
        no time has elapsed since the previous sample, or if no current task
        has a previous sample in Zookeeper (expected on the first run)
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            # Zookeeper hands back byte strings; decode so the text handling
            # below behaves the same on Python 2 and Python 3.
            last_time = float(zk.get(zk_last_time_path)[0].decode('utf8'))
            raw_cpu_data = zk.get(zk_last_cpu_data)[0].decode('utf8')
            last_cpu_data = [datum for datum in raw_cpu_data.split(',') if datum]
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    task_stats = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in task_stats.items():
        # NOTE(review): 0.1 CPU is subtracted from the limit, presumably to
        # discount executor overhead -- confirm.
        usable_cpus = stats.get('cpus_limit', 0) - .1
        if usable_cpus <= 0:
            # A missing or tiny limit would divide by zero or flip the sign of
            # the result, so such tasks are left out of the sample.
            continue
        cpu_seconds = float(stats.get('cpus_system_time_secs', 0.0) + stats.get('cpus_user_time_secs', 0.0))
        mesos_cpu_data[task_id] = cpu_seconds / usable_cpus

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    # The new sample is stored before utilization is computed so that the next
    # run always has a baseline, even when this run raises below.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv).encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    if time_delta <= 0:
        # Timestamps have one-second resolution, so two runs within the same
        # second would otherwise divide by zero.
        raise MetricsProviderNoDataError("No time has elapsed since the last cpu sample was recorded")

    utilization = {}
    for datum in last_cpu_data:
        # Split on the first colon only: the cpu seconds never contain one.
        last_cpu_seconds, task_id = datum.split(':', 1)
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError("""The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run.""")

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)

    return average_utilization
Example #2
0
def pid_decision_policy(zookeeper_path, current_instances, min_instances,
                        max_instances, error, **kwargs):
    """
    Uses a PID to determine when to autoscale a service.
    See https://en.wikipedia.org/wiki/PID_controller for more information on PIDs.
    Kp, Ki and Kd are the canonical PID constants, where the output of the PID is:
    Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)

    The integral term, last error and last run time are persisted under
    ``zookeeper_path`` between runs.

    :param zookeeper_path: Zookeeper path under which the PID state is stored
    :param current_instances: the number of instances currently running
    :param min_instances: lower bound on the instance count
    :param max_instances: upper bound on the instance count
    :param error: the current error fed into the controller
    :param kwargs: ignored

    :returns: the change in instance count as an int, clamped to
        [min_instances - current_instances, max_instances - current_instances]
    """
    min_delta = min_instances - current_instances
    max_delta = max_instances - current_instances

    def clamp_value(number):
        return min(max(number, min_delta), max_delta)

    Kp = 4
    # NOTE(review): on Python 2 without `from __future__ import division` this
    # is integer division when AUTOSCALING_DELAY is an int -- confirm.
    Ki = 4 / AUTOSCALING_DELAY
    Kd = 1 * AUTOSCALING_DELAY

    zk_iterm_path = '%s/pid_iterm' % zookeeper_path
    zk_last_error_path = '%s/pid_last_error' % zookeeper_path
    zk_last_time_path = '%s/pid_last_time' % zookeeper_path

    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            # No stored state yet: start from zero. With last_time at 0 the
            # first time_delta spans the whole epoch; the integral term is
            # clamped below, which bounds the effect.
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    iterm = clamp_value(iterm + (Ki * error) * time_delta)

    # State is written once, after the integral term is updated; an earlier
    # write of the stale values would be overwritten here anyway.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_iterm_path, str(iterm).encode('utf8'))
        zk.set(zk_last_error_path, str(error).encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    # Timestamps have one-second resolution; if no time has elapsed the
    # derivative is undefined, so it contributes nothing instead of raising
    # ZeroDivisionError.
    derivative_term = Kd * (error - last_error) / time_delta if time_delta else 0.0

    return int(round(clamp_value(Kp * error + iterm + derivative_term)))
Example #3
0
def pid_decision_policy(marathon_service_config, error, **kwargs):
    """
    Uses a PID to determine when to autoscale a service.
    See https://en.wikipedia.org/wiki/PID_controller for more information on PIDs.
    Kp, Ki and Kd are the canonical PID constants, where the output of the PID is:
    Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)

    The integral term, last error and last run time are persisted in Zookeeper
    under the service instance's autoscaling root between runs.

    :param marathon_service_config: config whose ``service`` and ``instance``
        select the Zookeeper root holding the PID state
    :param error: the current error fed into the controller
    :param kwargs: ignored

    :returns: the rounded, clamped PID output as an int. ``clamp_value`` is
        not defined in this function and is resolved from the enclosing scope.
    """
    Kp = 0.2
    Ki = 0.2 / AUTOSCALING_DELAY
    Kd = 0.05 * AUTOSCALING_DELAY

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_iterm_path = '%s/pid_iterm' % autoscaling_root
    zk_last_error_path = '%s/pid_last_error' % autoscaling_root
    zk_last_time_path = '%s/pid_last_time' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            # No stored state yet: start from zero.
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    iterm = clamp_value(iterm + (Ki * error) * time_delta)

    # State is written once, after the integral term is updated; an earlier
    # write of the stale values would be overwritten here anyway.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_iterm_path, str(iterm).encode('utf8'))
        zk.set(zk_last_error_path, str(error).encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    # Timestamps have one-second resolution; if no time has elapsed the
    # derivative is undefined, so it contributes nothing instead of raising
    # ZeroDivisionError.
    derivative_term = Kd * (error - last_error) / time_delta if time_delta else 0.0

    return int(round(clamp_value(Kp * error + iterm + derivative_term)))
Example #4
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Run one autoscaling pass over every service configured for scaling.

    Does nothing (apart from a warning) when autoscaling is paused or when the
    autoscaling lock is already held. A failure while scaling one service is
    written to that service's log and does not stop the remaining services.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    try:
        with create_autoscaling_lock():
            system_paasta_config = load_system_paasta_config()
            cluster = system_paasta_config.get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster,
                                                       soa_dir=soa_dir)
            if not configs:
                # Nothing to scale: skip fetching Marathon and Mesos state.
                return

            marathon_clients = get_marathon_clients(
                get_marathon_servers(system_paasta_config))
            apps_with_clients = get_marathon_apps_with_clients(
                marathon_clients.get_all_clients(), embed_tasks=True)
            all_mesos_tasks = get_all_running_tasks()
            # The app list is the same for every config, so build it once.
            marathon_apps = [app for (app, client) in apps_with_clients]

            with ZookeeperPool():
                for config in configs:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            marathon_apps,
                            all_mesos_tasks,
                            config,
                        )
                        autoscale_marathon_instance(
                            config, list(marathon_tasks.values()),
                            mesos_tasks)
                    except Exception as e:
                        # Best effort: one failing service must not block the rest.
                        write_to_log(config=config,
                                     line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for services because the lock is held")
Example #5
0
def fetch_historical_load(zk_path_prefix):
    """
    Read and deserialize the historical load stored in Zookeeper.

    :param zk_path_prefix: prefix used to build the historical-load node path
    :returns: the deserialized historical load, or an empty list when the
        node does not exist
    """
    with ZookeeperPool() as zk:
        try:
            node_path = zk_historical_load_path(zk_path_prefix)
            serialized_load = zk.get(node_path)[0]
            return deserialize_historical_load(serialized_load)
        except NoNodeError:
            # Nothing has been recorded yet.
            return []
Example #6
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Run one autoscaling pass over the cluster's marathon services.

    Only services with a max-instances setting and a desired state of
    'start' are considered, and those whose decision policy is 'bespoke' are
    skipped. A failure while scaling one service is written to that service's
    log and does not stop the remaining services. If the autoscaling lock is
    already held the run is silently skipped.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if not configs:
                return

            marathon_config = load_marathon_config()
            all_marathon_tasks = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            ).list_tasks()
            all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids

            with ZookeeperPool():
                for config in configs:
                    if config.get_autoscaling_params()['decision_policy'] == 'bespoke':
                        continue
                    try:
                        job_id = format_job_id(config.service, config.instance)
                        # Keep only this job's tasks that have health check results.
                        marathon_tasks = {
                            task.id: task
                            for task in all_marathon_tasks
                            if job_id == get_short_job_id(task.id) and task.health_check_results
                        }
                        if not marathon_tasks:
                            raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                        mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                        autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                    except Exception as e:
                        # Log and move on. Re-raising here made this logging
                        # unreachable and aborted the run for every remaining
                        # service.
                        write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaling run holds the lock; skip this run.
        pass
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Run one autoscaling pass over every service configured for scaling.

    Marathon and Mesos task lists are fetched once and filtered per service.
    A failure while scaling one service is written to that service's log and
    does not stop the remaining services. If the autoscaling lock is already
    held, a warning is logged and the run is skipped.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)

            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks, all_mesos_tasks = get_all_marathon_mesos_tasks(marathon_client)

            if not configs:
                return

            with ZookeeperPool():
                for service_config in configs:
                    try:
                        filtered = filter_autoscaling_tasks(
                            marathon_client,
                            all_marathon_tasks,
                            all_mesos_tasks,
                            service_config,
                        )
                        marathon_tasks, mesos_tasks = filtered
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as err:
                        # Best effort: one failing service must not block the rest.
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    """
    Autoscale each of the given service configs.

    Does nothing (apart from a warning) when autoscaling is paused. A failure
    while scaling one service is written to that service's log at debug level
    and does not stop the remaining services.

    :param service_configs: the service configs to autoscale
    :param system_paasta_config: system config used to locate the Marathon
        servers; also passed through to the filtering and scaling helpers
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    marathon_clients = get_marathon_clients(
        get_marathon_servers(system_paasta_config))
    apps_with_clients = get_marathon_apps_with_clients(
        marathon_clients.get_all_clients(), embed_tasks=True)
    all_mesos_tasks = a_sync.block(get_all_running_tasks)
    # The app list is the same for every config, so build it once instead of
    # once per loop iteration.
    marathon_apps = [app for (app, client) in apps_with_clients]

    with ZookeeperPool():
        for config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    marathon_apps,
                    all_mesos_tasks,
                    config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as e:
                # Best effort: one failing service must not block the rest.
                write_to_log(config=config,
                             line="Caught Exception %s" % e,
                             level="debug")
def zookeeper_write_bogus_key(context, zookeeper_path):
    """
    Write a placeholder value to ``zookeeper_path``.

    ``SystemPaastaConfig.get_zk_hosts`` is patched to return
    ``context.zk_hosts`` for the duration of the write.
    """
    patched_zk_hosts = mock.patch.object(
        SystemPaastaConfig, "get_zk_hosts", autospec=True, return_value=context.zk_hosts
    )
    with patched_zk_hosts, ZookeeperPool() as zk:
        zk.ensure_path(zookeeper_path)
        zk.set(zookeeper_path, b"WHATEVER")
Example #10
0
def start_deployd(context):
    """
    Start the paasta-deployd daemon and wait until it reports startup.

    Ensures the services directory and the '/autoscaling' Zookeeper node
    exist, launches 'paasta-deployd' unless ``context`` already has a
    ``daemon``, then reads the daemon's stderr until "Startup finished!"
    appears.

    :param context: test context; ``zk_hosts``, ``soa_dir`` and (if absent)
        ``daemon`` are set on it
    :raises OSError: if the services directory cannot be created for any
        reason other than already existing
    :raises Exception: if deployd exits before finishing startup, or output
        is still arriving 60 seconds after the wait began
    """
    try:
        os.makedirs('/nail/etc/services')
    except OSError as e:
        # An existing directory is fine; any other failure (permissions,
        # read-only filesystem, ...) must surface rather than be swallowed.
        if e.errno != errno.EEXIST:
            raise
    with ZookeeperPool() as zk:
        try:
            zk.create('/autoscaling')
        except NodeExistsError:
            pass
    context.zk_hosts = '%s/mesos-testcluster' % get_service_connection_string(
        'zookeeper')
    context.soa_dir = '/nail/etc/services'
    if not hasattr(context, 'daemon'):
        context.daemon = Popen('paasta-deployd', stderr=PIPE)
    output = context.daemon.stderr.readline().decode('utf-8')
    timeout = time.time() + 60
    while "Startup finished!" not in output:
        output = context.daemon.stderr.readline().decode('utf-8')
        if not output:
            # An empty read means stderr hit EOF, i.e. the process went away.
            raise Exception("deployd exited prematurely")
        print(output.rstrip('\n'))
        if time.time() > timeout:
            raise Exception("deployd never ran")
    time.sleep(5)
Example #11
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Run one autoscaling pass over every service configured for scaling.

    For each service, the healthy Marathon tasks and their matching Mesos
    tasks are selected and handed to ``autoscale_marathon_instance``. A
    failure while scaling one service is written to that service's log and
    does not stop the remaining services. If the autoscaling lock is already
    held, a warning is logged and the run is skipped.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if not configs:
                return

            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks = marathon_client.list_tasks()
            # empty string matches all app ids
            all_mesos_tasks = get_running_tasks_from_active_frameworks('')

            with ZookeeperPool():
                for service_config in configs:
                    try:
                        job_id = format_job_id(service_config.service, service_config.instance)
                        log.info("Inspecting %s for autoscaling" % job_id)

                        # A task counts as healthy when its health checks pass,
                        # or when its app defines no health checks at all. A
                        # task with a defined health check but no passing
                        # result is treated as unhealthy.
                        healthy_tasks = {}
                        for task in all_marathon_tasks:
                            if job_id != get_short_job_id(task.id):
                                continue
                            if is_task_healthy(task) or not marathon_client.get_app(task.app_id).health_checks:
                                healthy_tasks[task.id] = task

                        if not healthy_tasks:
                            raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")

                        matching_mesos_tasks = [
                            mesos_task for mesos_task in all_mesos_tasks
                            if mesos_task['id'] in healthy_tasks
                        ]
                        autoscale_marathon_instance(
                            service_config,
                            list(healthy_tasks.values()),
                            matching_mesos_tasks,
                        )
                    except Exception as err:
                        # Best effort: one failing service must not block the rest.
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def delete_service_autoscaler_pause(request):
    """
    Remove the autoscaler pause node from Zookeeper.

    The node is created first if missing so that the delete cannot fail on a
    non-existent path.

    :param request: the incoming API request (not read here)
    :raises ApiFailure: with status 500 if any Zookeeper operation fails
    """
    with ZookeeperPool() as zk:
        try:
            for operation in (zk.ensure_path, zk.delete):
                operation(ZK_PAUSE_AUTOSCALE_PATH)
        except Exception as err:
            raise ApiFailure(err, 500)
Example #13
0
def set_instances_for_marathon_service(service,
                                       instance,
                                       instance_count,
                                       soa_dir=DEFAULT_SOA_DIR):
    """
    Store the instance count for a service instance in Zookeeper.

    The value is written to the 'instances' node under the service
    instance's autoscaling root, creating the node if needed.

    :param service: the service name
    :param instance: the instance name
    :param instance_count: the number of instances to record
    :param soa_dir: accepted for interface compatibility; not read here
    """
    zookeeper_path = '%s/instances' % compose_autoscaling_zookeeper_root(
        service, instance)
    with ZookeeperPool() as zookeeper_client:
        zookeeper_client.ensure_path(zookeeper_path)
        # Zookeeper node values are byte strings; encoding explicitly keeps
        # this working on Python 3, where str is not bytes.
        zookeeper_client.set(zookeeper_path, str(instance_count).encode('utf8'))
Example #14
0
def zookeeper_rmr_keys(context):
    """
    Recursively delete '/autoscaling/test-service' from Zookeeper.

    Sets ``context.zk_hosts`` and patches ``SystemPaastaConfig.get_zk_hosts``
    to return it for the duration of the delete.
    """
    zk_connection_string = get_service_connection_string("zookeeper")
    context.zk_hosts = "%s/mesos-testcluster" % zk_connection_string

    patched_zk_hosts = mock.patch.object(
        SystemPaastaConfig, "get_zk_hosts", autospec=True, return_value=context.zk_hosts
    )
    with patched_zk_hosts, ZookeeperPool() as zk:
        zk.delete("/autoscaling/test-service", recursive=True)
def set_instances_for_marathon_service(
    service: str, instance: str, instance_count: int, soa_dir: str = DEFAULT_SOA_DIR
) -> None:
    """
    Store the instance count for a service instance in Zookeeper.

    The count is written, UTF-8 encoded, to the "instances" node under the
    service instance's autoscaling root. ``soa_dir`` is not read here.
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(service, instance)
    instances_path = "%s/instances" % autoscaling_root
    encoded_count = str(instance_count).encode("utf8")
    with ZookeeperPool() as zk:
        zk.ensure_path(instances_path)
        zk.set(instances_path, encoded_count)
Example #16
0
def get_boost_factor(zk_boost_path: str) -> float:
    """Return the stored boost factor while a boost is active, else 1.0.

    A boost is active when the current time is before the boost's end time.
    """
    now = get_time()

    with ZookeeperPool() as zk:
        boost = get_boost_values(zk_boost_path, zk)
        boost_is_active = now < boost.end_time
        return boost.boost_factor if boost_is_active else 1.0
def get_service_autoscaler_pause(request):
    """
    Return the stored autoscaler pause value as a string.

    :param request: the incoming API request (not read here)
    :returns: the decoded contents of the pause node, or "0" when the node
        is missing or a ValueError occurs while reading it
    :raises ApiFailure: with status 500 on any other error
    """
    with ZookeeperPool() as zk:
        try:
            node_data = zk.get(ZK_PAUSE_AUTOSCALE_PATH)[0]
            return node_data.decode("utf8")
        except (NoNodeError, ValueError):
            return "0"
        except Exception as err:
            raise ApiFailure(err, 500)
Example #18
0
 def run(self):
     """Open a Zookeeper connection and enter the deployd leader election.

     ``self.startup`` is passed to the election's ``run``.
     """
     self.log.info("paasta-deployd starting up...")
     with ZookeeperPool() as self.zk:
         self.log.info("Waiting to become leader")
         leader_election = PaastaLeaderElection(
             self.zk,
             "/paasta-deployd-leader",
             socket.getfqdn(),
             control=self.control,
         )
         self.election = leader_election
         self.is_leader = False
         leader_election.run(self.startup)
Example #19
0
def update_service_autoscaler_pause(request):
    """
    Pause the autoscaler for the number of minutes given in the request body.

    The expiry timestamp (now plus the requested minutes) is written, UTF-8
    encoded, to the pause node in Zookeeper.

    :param request: API request whose swagger "json_body" holds "minutes"
    :raises ApiFailure: with status 500 if the Zookeeper write fails
    """
    json_body = request.swagger_data.get("json_body")
    minutes = json_body["minutes"]
    now = time.time()
    pause_expires_at = now + minutes * 60

    with ZookeeperPool() as zk:
        try:
            zk.ensure_path(ZK_PAUSE_AUTOSCALE_PATH)
            encoded_expiry = str(pause_expires_at).encode("utf-8")
            zk.set(ZK_PAUSE_AUTOSCALE_PATH, encoded_expiry)
        except Exception as err:
            raise ApiFailure(err, 500)
Example #20
0
def autoscaling_is_paused():
    """
    Report whether autoscaling is currently paused.

    The pause node in Zookeeper holds the timestamp until which autoscaling
    is paused. A missing or unreadable node counts as a timestamp of 0.

    :returns: True if the pause timestamp has not passed yet, else False
    """
    with ZookeeperPool() as zk:
        try:
            stored_value = zk.get(ZK_PAUSE_AUTOSCALE_PATH)[0].decode('utf8')
            pause_until = float(stored_value)
        except (NoNodeError, ValueError, AttributeError):
            pause_until = 0

    remaining = pause_until - time.time()
    is_paused = remaining >= 0
    if is_paused:
        log.debug("Autoscaling is paused for {} more seconds".format(str(remaining)))
    return is_paused
Example #21
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Run one autoscaling pass over every service configured for scaling.

    For each service, the Marathon tasks that count as healthy and their
    matching Mesos tasks are handed to ``autoscale_marathon_instance``. A
    failure while scaling one service is written to that service's log and
    does not stop the remaining services. If the autoscaling lock is already
    held, a warning is logged and the run is skipped.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if not configs:
                return

            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks = marathon_client.list_tasks()
            all_mesos_tasks = get_all_running_tasks()

            def counts_as_healthy(task):
                # Tasks with no healthcheck defined are assumed healthy. Tasks
                # with a defined healthcheck but no results are assumed
                # unhealthy, unless they are "old", in which case marathon is
                # assumed to have stopped healthchecking them and they are
                # treated as healthy.
                if is_task_healthy(task):
                    return True
                if not marathon_client.get_app(task.app_id).health_checks:
                    return True
                return is_old_task_missing_healthchecks(task, marathon_client)

            with ZookeeperPool():
                for service_config in configs:
                    try:
                        job_id = service_config.format_marathon_app_dict()['id']
                        log.info("Inspecting %s for autoscaling" % job_id)
                        healthy_tasks = {
                            task.id: task
                            for task in all_marathon_tasks
                            if task.id.startswith(job_id) and counts_as_healthy(task)
                        }
                        if not healthy_tasks:
                            raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                        matching_mesos_tasks = [
                            mesos_task for mesos_task in all_mesos_tasks
                            if mesos_task['id'] in healthy_tasks
                        ]
                        autoscale_marathon_instance(
                            service_config,
                            list(healthy_tasks.values()),
                            matching_mesos_tasks,
                        )
                    except Exception as err:
                        # Best effort: one failing service must not block the rest.
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
 def run(self) -> None:
     """Count a process start, then enter the deployd leader election.

     ``self.startup`` is passed to the election's ``run``; once that call
     returns, the method logs that leadership was given up and exits.
     """
     self.log.info("paasta-deployd starting up...")
     self.metrics.create_counter(
         "process_started",
         paasta_cluster=self.config.get_cluster(),
     ).count()
     with ZookeeperPool() as self.zk:
         leader_election = PaastaLeaderElection(
             self.zk, "/paasta-deployd-leader", socket.getfqdn(), control=self.control
         )
         self.election = leader_election
         self.is_leader = False
         self.log.info("Waiting to become leader")
         leader_election.run(self.startup)
         self.log.info("Leadership given up, exiting...")
Example #23
0
def get_boosted_load(region: str, pool: str, current_load: float) -> float:
    """Return the load the autoscaler should use, applying any active boost.

    While a boost is active, the expected load (current load times the boost
    factor) is computed once, stored in ZooKeeper, and returned whenever it
    exceeds the current load. Once the boost has ended, a stored expected
    load is reset to 0 and the current load is returned.

    Any error is logged and the current load is returned, so the autoscaler
    is never blocked by this function.
    """
    try:
        zk_boost_path = get_zk_boost_path(region, pool)
        now = get_time()
        expected_load_path = zk_boost_path + '/expected_load'

        with ZookeeperPool() as zk:
            boost = get_boost_values(region, pool, zk)

            if now >= boost.end_time:
                # Boost is over. A non-zero expected load means a boost period
                # has just completed, so reset it.
                if boost.expected_load > 0:
                    zk.set(expected_load_path, '0'.encode('utf-8'))
                return current_load

            if boost.expected_load != 0:
                # Expected load was already computed for this boost.
                expected_load = boost.expected_load
            else:
                # First call of this boost period: compute and persist it.
                expected_load = current_load * boost.boost_factor
                log.debug('Activating boost, storing expected load: {} in ZooKeeper'.format(expected_load))
                zk.ensure_path(expected_load_path)
                zk.set(expected_load_path, str(expected_load).encode('utf-8'))

            # The boosted load only applies when it exceeds the real load.
            return max(current_load, expected_load)

    except Exception as err:
        # Fail gracefully in the face of ANY error
        log.error('get_boost failed with: {}'.format(err))
        return current_load
Example #24
0
def start_deployd(context):
    """
    Start an in-process DeployDaemon and wait for it to report started.

    Ensures the services directory and the '/autoscaling' Zookeeper node
    exist, creates and starts a ``DeployDaemon`` unless ``context`` already
    has a ``daemon``, then polls ``daemon.started`` up to 10 times, 3
    seconds apart.

    :param context: test context; ``soa_dir`` and (if absent) ``daemon`` are
        set on it
    :raises OSError: if the services directory cannot be created for any
        reason other than already existing
    :raises AssertionError: if the daemon has not started after polling
    """
    try:
        os.makedirs('/nail/etc/services')
    except OSError as e:
        # An existing directory is fine; any other failure (permissions,
        # read-only filesystem, ...) must surface rather than be swallowed.
        if e.errno != errno.EEXIST:
            raise
    with ZookeeperPool() as zk:
        try:
            zk.create('/autoscaling')
        except NodeExistsError:
            pass

    context.soa_dir = '/nail/etc/services'
    if not hasattr(context, 'daemon'):
        context.daemon = DeployDaemon()
        context.daemon.start()
    for _ in range(10):
        if context.daemon.started:
            return
        time.sleep(3)
    assert context.daemon.started
Example #25
0
def start_deployd(context):
    """
    Start the paasta-deployd daemon and wait until it reports startup.

    Ensures the services directory and the "/autoscaling" Zookeeper node
    exist, launches "paasta-deployd" unless ``context`` already has a
    ``daemon``, then reads the daemon's stderr until "Startup finished!"
    appears. Afterwards a background thread keeps draining stderr and counts
    dead-worker messages in ``context.num_workers_crashed``.

    :param context: test context; ``zk_hosts``, ``soa_dir``,
        ``num_workers_crashed`` and (if absent) ``daemon`` are set on it
    :raises OSError: if the services directory cannot be created
    :raises Exception: if deployd exits before finishing startup, or output
        is still arriving 60 seconds after the wait began
    """
    # exist_ok tolerates an existing directory while still surfacing every
    # other failure (permissions, read-only filesystem, ...).
    os.makedirs("/nail/etc/services", exist_ok=True)
    with ZookeeperPool() as zk:
        try:
            zk.create("/autoscaling")
        except NodeExistsError:
            pass
    context.zk_hosts = "%s/mesos-testcluster" % get_service_connection_string(
        "zookeeper")
    context.soa_dir = "/nail/etc/services"
    if not hasattr(context, "daemon"):
        context.daemon = Popen("paasta-deployd", stderr=PIPE)
    output = context.daemon.stderr.readline().decode("utf-8")
    timeout = time.time() + 60
    while "Startup finished!" not in output:
        output = context.daemon.stderr.readline().decode("utf-8")
        if not output:
            # An empty read means stderr hit EOF, i.e. the process went away.
            raise Exception("deployd exited prematurely")
        print(output.rstrip("\n"))
        if time.time() > timeout:
            raise Exception("deployd never ran")

    context.num_workers_crashed = 0

    def dont_let_stderr_buffer():
        # Keep reading so the daemon never blocks on a full stderr pipe, and
        # count worker crashes along the way.
        while True:
            line = context.daemon.stderr.readline()
            if not line:
                return
            if DEAD_DEPLOYD_WORKER_MESSAGE.encode("utf-8") in line:
                context.num_workers_crashed += 1
            paasta_print(f"deployd stderr: {line}")

    threading.Thread(target=dont_let_stderr_buffer).start()
    time.sleep(5)
Example #26
0
def main(argv=None):
    """
    Entry point for the paasta-api server.

    Parses the command-line arguments, configures logging (DEBUG when
    ``--debug`` is set, WARNING otherwise), and serves the WSGI app on the
    requested port until interrupted.

    :param argv: accepted for interface compatibility; not read here
        (arguments come from ``parse_paasta_api_args()``)
    :raises SystemExit: with status 0 on KeyboardInterrupt
    """
    monkey.patch_all()
    args = parse_paasta_api_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    # Convert once: the port may arrive as a string, and "%d" below would
    # raise TypeError on a non-numeric type.
    port = int(args.port)
    server = WSGIServer(('', port), make_app())
    log.info("paasta-api started on port %d with soa_dir %s" %
             (port, settings.soa_dir))

    try:
        # We create the Zookeeper pool here to prevent the context manager
        # tearing down the client after each request. This can cause an exception
        # if the API is dealing with two or more requests at the same time!
        with ZookeeperPool() as _:  # noqa
            server.serve_forever()
    except KeyboardInterrupt:
        # The bare `exit` builtin is injected by the `site` module and is not
        # guaranteed to exist; raising SystemExit has the same effect.
        raise SystemExit(0)
Example #27
0
def set_boost_factor(
    zk_boost_path: str,
    region: str = '',
    pool: str = '',
    send_clusterman_metrics: bool = False,
    factor: float = DEFAULT_BOOST_FACTOR,
    duration_minutes: int = DEFAULT_BOOST_DURATION,
    override: bool = False,
) -> bool:
    """
    Store a boost factor, its end time and a reset expected load under a
    Zookeeper path.

    Usable for both cluster and service autoscalers. For a cluster boost,
    pass region and pool and set send_clusterman_metrics=True so that the
    clusterman metrics are updated as well; otherwise zk_boost_path alone is
    sufficient.

    A factor above MAX_BOOST_FACTOR and a duration above MAX_BOOST_DURATION
    are capped (with a warning).

    :returns: False if the factor is below MIN_BOOST_FACTOR, or if a boost is
        already active and override is not set; otherwise whether the values
        read back from Zookeeper match what was written
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    if factor > MAX_BOOST_FACTOR:
        log.warning(
            'Boost factor {} does not sound reasonable. Defaulting to {}'.format(factor, MAX_BOOST_FACTOR),
        )
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning(
            'Boost duration of {} minutes is too much. Falling back to {}.'.format(
                duration_minutes, MAX_BOOST_DURATION,
            ),
        )
        duration_minutes = MAX_BOOST_DURATION

    now = get_time()
    boost_end = now + 60 * duration_minutes

    if clusterman_metrics and send_clusterman_metrics:
        cluster = load_system_paasta_config().get_cluster()
        metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
            region_name=region,
            app_identifier=pool,
        )
        with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
            dimensions = {'cluster': cluster, 'pool': pool}
            metrics_key = clusterman_metrics.generate_key_with_dimensions('boost_factor', dimensions)
            writer.send((metrics_key, now, factor))
            if duration_minutes > 0:
                # Record the moment the factor falls back to 1.0.
                writer.send((metrics_key, boost_end, 1.0))

    # (node path, text value) for every node making up the boost.
    boost_nodes = (
        (zk_boost_path + '/end_time', str(boost_end)),
        (zk_boost_path + '/factor', str(factor)),
        (zk_boost_path + '/expected_load', '0'),
    )

    with ZookeeperPool() as zk:
        if not override and now < get_boost_values(zk_boost_path, zk).end_time:
            log.error('Boost already active. Not overriding.')
            return False

        try:
            # Create every node first, then write the values.
            for node_path, _ in boost_nodes:
                zk.ensure_path(node_path)
            for node_path, text_value in boost_nodes:
                zk.set(node_path, text_value.encode('utf-8'))
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        readable_end = datetime.fromtimestamp(boost_end).strftime('%c')
        log.info(
            'Load boost: Set capacity boost factor {} at path {} until {}'.format(
                factor, zk_boost_path, readable_end,
            ),
        )

        # Read the values back to confirm they were written properly.
        stored_values = get_boost_values(zk_boost_path, zk)
        return stored_values == BoostValues(
            end_time=boost_end,
            boost_factor=factor,
            expected_load=0,
        )
Example #28
0
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    Utilization is derived by comparing the cumulative (limit-normalized) cpu
    seconds of each task against the values stored in Zookeeper by the
    previous run, divided by the wall-clock time elapsed between the two runs.

    :param marathon_service_config: the MarathonServiceConfig to get data from
                                    (only ``service`` and ``instance`` are read here)
    :param system_paasta_config: not used by this provider
    :param marathon_tasks: Marathon tasks to get data from (not used by this provider)
    :param mesos_tasks: Mesos tasks to get data from; each must expose an
                        awaitable ``stats()`` and be indexable by ``"id"``
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance().
                                 It is mutated in place; when omitted, a fresh dict is used for this call.
    :param noop: when True, the new sample is not written back to Zookeeper

    :returns: the service's mean utilization, nominally from 0 to 1
    :raises MetricsProviderNoDataError: if Mesos returned no usable cpu data, if
        no time has elapsed since the previous sample, or if there is no
        previous sample for any currently running task (expected on first run)
    """
    # Use a fresh dict per call: a ``{}`` default would be shared between calls
    # and silently accumulate one entry per invocation.
    if log_utilization_data is None:
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    # Load the previous sample (timestamp + "cpu_seconds:task_id" CSV), if any.
    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = [
                datum for datum in last_cpu_data.split(",") if datum
            ]
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    # Fetch the stats of every task concurrently, waiting at most 60 seconds.
    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        # asyncio.wait() returns once the timeout expires even if some futures
        # are still pending; calling exception()/result() on a pending or
        # cancelled future raises, so treat those as "no data" as well.
        if not fut.done() or fut.cancelled():
            return None
        if fut.exception() is not None:
            return None
        return fut.result()

    mesos_tasks_stats = {
        task["id"]: results_or_None(fut)
        for task, fut in zip(mesos_tasks, futures)
    }

    # NOTE(review): "%s" is not a documented Python strftime directive and
    # relies on the platform's libc to yield the epoch seconds.
    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is None:
            continue
        try:
            utime = float(stats["cpus_user_time_secs"])
            stime = float(stats["cpus_system_time_secs"])
            # presumably the 0.1 discounts per-task overhead included in
            # cpus_limit -- TODO confirm
            limit = float(stats["cpus_limit"]) - 0.1
        except KeyError:
            # Incomplete stats for this task: skip it.
            continue
        if limit <= 0:
            # A limit of 0.1 (or less) cannot be normalized; skipping the task
            # avoids a ZeroDivisionError that would abort the whole provider.
            continue
        mesos_cpu_data[task_id] = (stime + utime) / limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}"
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    if last_cpu_data and time_delta <= 0:
        # Two samples within the same second (or a clock going backwards)
        # cannot produce a meaningful rate and would divide by zero below.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        # Split on the first colon only so a task id containing ":" survives.
        last_cpu_seconds, task_id = datum.split(":", 1)
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    return mean(utilization.values())
Example #29
0
def save_historical_load(historical_load, zk_path_prefix):
    """
    Stores the serialized historical load in Zookeeper.

    :param historical_load: the historical load data, passed to serialize_historical_load()
    :param zk_path_prefix: prefix handed to zk_historical_load_path() to build the node path
    """
    with ZookeeperPool() as zk_client:
        payload = serialize_historical_load(historical_load)
        node_path = zk_historical_load_path(zk_path_prefix)
        # Make sure the node exists before writing the payload to it.
        zk_client.ensure_path(node_path)
        zk_client.set(node_path, payload)
def get_instances_from_zookeeper(service: str, instance: str) -> int:
    """
    Reads the value stored at ``<autoscaling root>/instances`` in Zookeeper.

    :param service: service name, used to compose the autoscaling Zookeeper root
    :param instance: instance name, used to compose the autoscaling Zookeeper root

    :returns: the stored value converted to an int
    """
    with ZookeeperPool() as zk:
        autoscaling_root = compose_autoscaling_zookeeper_root(service, instance)
        # get() yields a (data, stat) pair; only the data is needed here.
        raw_count, _stat = zk.get(f"{autoscaling_root}/instances")
        return int(raw_count)