def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """
    Gets the average cpu utilization of a service across all of its tasks.

    On every call the cumulative CPU seconds of each Mesos task (system plus
    user time, divided by the task's cpu limit minus 0.1) are written to
    Zookeeper together with the current time. Utilization of a task is the
    growth of that figure since the previous call divided by the elapsed
    seconds; the result is the average over all tasks that appear in both the
    previous and the current sample.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :returns: the service's average utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if there are no Mesos tasks, or if none
        of the current tasks has a previous sample stored in Zookeeper
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            # Zookeeper returns raw bytes; decode before splitting on text separators.
            last_cpu_data = [datum for datum in last_cpu_data.decode('utf8').split(',') if datum]
        except NoNodeError:
            # First run for this service: nothing to compare against yet.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        cpu_seconds = float(stats.get('cpus_system_time_secs', 0.0) + stats.get('cpus_user_time_secs', 0.0))
        mesos_cpu_data[task_id] = cpu_seconds / (stats.get('cpus_limit', 0) - .1)

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        # Zookeeper node values must be byte strings.
        zk.set(zk_last_cpu_data, str(cpu_data_csv).encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        # Tasks that disappeared since the last sample are ignored.
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            "The mesos_cpu metrics provider doesn't have Zookeeper data for "
            "this service. This is expected for its first run."
        )

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)

    return average_utilization
def pid_decision_policy(zookeeper_path, current_instances, min_instances, max_instances, error, **kwargs):
    """
    Uses a PID to determine when to autoscale a service.
    See https://en.wikipedia.org/wiki/PID_controller for more information on PIDs.
    Kp, Ki and Kd are the canonical PID constants, where the output of the PID is:
    Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)

    The integral term, the last error and the last run time are persisted
    under ``zookeeper_path`` between calls.

    :param zookeeper_path: Zookeeper path prefix under which PID state is stored
    :param current_instances: the current number of instances
    :param min_instances: lower bound for the instance count
    :param max_instances: upper bound for the instance count
    :param error: the current error fed to the controller
    :returns: the rounded instance delta, clamped to
        [min_instances - current_instances, max_instances - current_instances]
    """
    min_delta = min_instances - current_instances
    max_delta = max_instances - current_instances

    def clamp_value(number):
        return min(max(number, min_delta), max_delta)

    Kp = 4
    Ki = 4 / AUTOSCALING_DELAY
    Kd = 1 * AUTOSCALING_DELAY

    zk_iterm_path = '%s/pid_iterm' % zookeeper_path
    zk_last_error_path = '%s/pid_last_error' % zookeeper_path
    zk_last_time_path = '%s/pid_last_time' % zookeeper_path

    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            # No stored state yet: start the controller from zero.
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    # The integral term is clamped as well so it cannot grow beyond what
    # the instance bounds allow.
    iterm = clamp_value(iterm + (Ki * error) * time_delta)

    # Persist the state once, after the new integral term is known. The
    # previous extra write of the stale iterm before this point was redundant.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        # Zookeeper node values must be byte strings.
        zk.set(zk_iterm_path, str(iterm).encode('utf8'))
        zk.set(zk_last_error_path, str(error).encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    return int(round(clamp_value(Kp * error + iterm + Kd * (error - last_error) / time_delta)))
def pid_decision_policy(marathon_service_config, error, **kwargs):
    """
    Uses a PID to determine when to autoscale a service.
    See https://en.wikipedia.org/wiki/PID_controller for more information on PIDs.
    Kp, Ki and Kd are the canonical PID constants, where the output of the PID is:
    Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)

    The integral term, the last error and the last run time are persisted in
    Zookeeper under the service's autoscaling root between calls.

    :param marathon_service_config: the MarathonServiceConfig whose service and
        instance names select the Zookeeper location of the PID state
    :param error: the current error fed to the controller
    :returns: the rounded, clamped output of the controller
    """
    Kp = 0.2
    Ki = 0.2 / AUTOSCALING_DELAY
    Kd = 0.05 * AUTOSCALING_DELAY

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_iterm_path = '%s/pid_iterm' % autoscaling_root
    zk_last_error_path = '%s/pid_last_error' % autoscaling_root
    zk_last_time_path = '%s/pid_last_time' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            # No stored state yet: start the controller from zero.
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    # NOTE(review): clamp_value is not defined inside this function; it is
    # presumably a module-level helper - confirm it exists and what bounds it applies.
    iterm = clamp_value(iterm + (Ki * error) * time_delta)

    # Persist the state once, after the new integral term is known. The
    # previous extra write of the stale iterm before this point was redundant.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        # Zookeeper node values must be byte strings.
        zk.set(zk_iterm_path, str(iterm).encode('utf8'))
        zk.set(zk_last_error_path, str(error).encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    return int(round(clamp_value(Kp * error + iterm + Kd * (error - last_error) / time_delta)))
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Runs one autoscaling pass over the services returned by
    get_configs_of_services_to_scale() for this cluster.

    Does nothing when autoscaling is paused or when the autoscaling lock is
    already held. A failure while handling one service is written to that
    service's log and does not stop the remaining services.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    try:
        with create_autoscaling_lock():
            system_paasta_config = load_system_paasta_config()
            configs = get_configs_of_services_to_scale(
                cluster=system_paasta_config.get_cluster(),
                soa_dir=soa_dir,
            )

            marathon_servers = get_marathon_servers(system_paasta_config)
            marathon_clients = get_marathon_clients(marathon_servers)
            apps_with_clients = get_marathon_apps_with_clients(
                marathon_clients.get_all_clients(),
                embed_tasks=True,
            )
            all_mesos_tasks = get_all_running_tasks()

            if not configs:
                return

            # Keep one Zookeeper connection open for the whole pass.
            with ZookeeperPool():
                for service_config in configs:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            [app for app, _client in apps_with_clients],
                            all_mesos_tasks,
                            service_config,
                        )
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as err:
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def fetch_historical_load(zk_path_prefix):
    """
    Reads and deserializes the historical load stored in Zookeeper.

    :param zk_path_prefix: prefix handed to zk_historical_load_path() to locate the node
    :returns: the deserialized historical load, or an empty list if the node does not exist
    """
    with ZookeeperPool() as zk:
        try:
            raw_load, _stat = zk.get(zk_historical_load_path(zk_path_prefix))
        except NoNodeError:
            return []
        else:
            return deserialize_historical_load(raw_load)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Runs one autoscaling pass over every marathon service instance in this
    cluster that has max_instances set and a desired state of 'start'.

    Instances whose decision policy is 'bespoke' are skipped. A failure while
    handling one instance is written to that instance's log and does not stop
    the remaining instances. If the autoscaling lock is already held, the run
    is skipped silently.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        if config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service, config.instance)
                                # Only tasks that have health check results are considered.
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id) and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                                mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                                autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                            except Exception as e:
                                # Log and move on: one broken service must not abort the
                                # run for all the others. (A stray `raise e` used to make
                                # this line unreachable.)
                                write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        # Another autoscaling run holds the lock; nothing to do.
        pass
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Runs one autoscaling pass over the services returned by
    get_configs_of_services_to_scale() for this cluster.

    A failure while handling one service is written to that service's log and
    does not stop the remaining services. If the autoscaling lock is already
    held, the run is skipped with a warning.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)

            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks, all_mesos_tasks = get_all_marathon_mesos_tasks(marathon_client)

            if not configs:
                return

            # Keep one Zookeeper connection open for the whole pass.
            with ZookeeperPool():
                for service_config in configs:
                    try:
                        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                            marathon_client,
                            all_marathon_tasks,
                            all_mesos_tasks,
                            service_config,
                        )
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as err:
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    """
    Autoscales each of the given service configs.

    Does nothing when autoscaling is paused. A failure while handling one
    config is written to that service's log at debug level and does not stop
    the remaining configs.

    :param service_configs: the service configs to autoscale
    :param system_paasta_config: system config used to find the marathon servers
    """
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)
    apps_with_clients = get_marathon_apps_with_clients(
        marathon_clients.get_all_clients(),
        embed_tasks=True,
    )
    all_mesos_tasks = a_sync.block(get_all_running_tasks)

    # Keep one Zookeeper connection open for the whole pass.
    with ZookeeperPool():
        for service_config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    [app for app, _client in apps_with_clients],
                    all_mesos_tasks,
                    service_config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    service_config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as err:
                write_to_log(
                    config=service_config,
                    line="Caught Exception %s" % err,
                    level="debug",
                )
def zookeeper_write_bogus_key(context, zookeeper_path):
    """
    Creates ``zookeeper_path`` if needed and stores the bytes b"WHATEVER" there.

    SystemPaastaConfig.get_zk_hosts is patched to return ``context.zk_hosts``
    for the duration of the write.

    :param context: test context carrying ``zk_hosts``
    :param zookeeper_path: the Zookeeper node to write to
    """
    zk_hosts_patch = mock.patch.object(
        SystemPaastaConfig,
        "get_zk_hosts",
        autospec=True,
        return_value=context.zk_hosts,
    )
    with zk_hosts_patch, ZookeeperPool() as zk:
        zk.ensure_path(zookeeper_path)
        zk.set(zookeeper_path, b"WHATEVER")
def start_deployd(context):
    """
    Starts the paasta-deployd process for the test run and waits for it to
    report "Startup finished!" on stderr.

    Ensures the SOA directory and the /autoscaling Zookeeper node exist,
    records ``zk_hosts`` and ``soa_dir`` on the context, and launches the
    daemon unless the context already has one.

    :param context: test context; ``zk_hosts``, ``soa_dir`` and ``daemon`` are set on it
    :raises Exception: if deployd's stderr closes before startup finishes, or
        if startup is not reported within 60 seconds
    """
    try:
        os.makedirs('/nail/etc/services')
    except OSError as e:
        # An already-existing directory is fine; anything else (e.g.
        # permissions) is a real failure and must not be hidden.
        if e.errno != errno.EEXIST:
            raise

    with ZookeeperPool() as zk:
        try:
            zk.create('/autoscaling')
        except NodeExistsError:
            pass

    context.zk_hosts = '%s/mesos-testcluster' % get_service_connection_string('zookeeper')
    context.soa_dir = '/nail/etc/services'
    if not hasattr(context, 'daemon'):
        context.daemon = Popen('paasta-deployd', stderr=PIPE)

    output = context.daemon.stderr.readline().decode('utf-8')
    start = time.time()
    timeout = start + 60
    while "Startup finished!" not in output:
        output = context.daemon.stderr.readline().decode('utf-8')
        if not output:
            # readline() returns an empty result only at end of stream.
            raise Exception("deployd exited prematurely")
        print(output.rstrip('\n'))
        if time.time() > timeout:
            raise Exception("deployd never ran")
    time.sleep(5)
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Runs one autoscaling pass over the services returned by
    get_configs_of_services_to_scale() for this cluster.

    A failure while handling one service is written to that service's log and
    does not stop the remaining services. If the autoscaling lock is already
    held, the run is skipped with a warning.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if not configs:
                return

            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks = marathon_client.list_tasks()
            all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids

            def counts_as_healthy(task):
                # Tasks with no healthcheck defined are assumed healthy. Tasks with a
                # defined healthcheck but no healthcheck results are assumed unhealthy.
                if is_task_healthy(task):
                    return True
                return not marathon_client.get_app(task.app_id).health_checks

            with ZookeeperPool():
                for service_config in configs:
                    try:
                        job_id = format_job_id(service_config.service, service_config.instance)
                        log.info("Inspecting %s for autoscaling" % job_id)

                        marathon_tasks = {}
                        for task in all_marathon_tasks:
                            if job_id == get_short_job_id(task.id) and counts_as_healthy(task):
                                marathon_tasks[task.id] = task
                        if not marathon_tasks:
                            raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")

                        mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as err:
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def delete_service_autoscaler_pause(request):
    """
    Removes the autoscaler pause node from Zookeeper.

    The node is created first if missing, so that the delete always has
    something to remove.

    :param request: the API request (not read)
    :raises ApiFailure: with status 500 if any Zookeeper operation fails
    """
    with ZookeeperPool() as zk:
        try:
            zk.ensure_path(ZK_PAUSE_AUTOSCALE_PATH)
            zk.delete(ZK_PAUSE_AUTOSCALE_PATH)
        except Exception as err:
            raise ApiFailure(err, 500)
    return None
def set_instances_for_marathon_service(service, instance, instance_count, soa_dir=DEFAULT_SOA_DIR):
    """
    Stores the instance count for a marathon service instance in Zookeeper.

    :param service: the service name
    :param instance: the instance name
    :param instance_count: the number of instances to record
    :param soa_dir: accepted for interface compatibility; not read here
    """
    zookeeper_path = '%s/instances' % compose_autoscaling_zookeeper_root(service, instance)
    with ZookeeperPool() as zookeeper_client:
        zookeeper_client.ensure_path(zookeeper_path)
        # Zookeeper node values must be byte strings.
        zookeeper_client.set(zookeeper_path, str(instance_count).encode('utf8'))
def zookeeper_rmr_keys(context):
    """
    Recursively deletes the /autoscaling/test-service subtree from Zookeeper.

    Sets ``context.zk_hosts`` to the test cluster connection string and patches
    SystemPaastaConfig.get_zk_hosts to return it for the duration of the delete.

    :param context: test context; ``zk_hosts`` is set on it
    """
    connection_string = get_service_connection_string("zookeeper")
    context.zk_hosts = "%s/mesos-testcluster" % connection_string

    zk_hosts_patch = mock.patch.object(
        SystemPaastaConfig,
        "get_zk_hosts",
        autospec=True,
        return_value=context.zk_hosts,
    )
    with zk_hosts_patch, ZookeeperPool() as zk:
        zk.delete("/autoscaling/test-service", recursive=True)
def set_instances_for_marathon_service(
    service: str, instance: str, instance_count: int, soa_dir: str = DEFAULT_SOA_DIR
) -> None:
    """
    Stores the instance count for a marathon service instance in Zookeeper.

    :param service: the service name
    :param instance: the instance name
    :param instance_count: the number of instances to record
    :param soa_dir: accepted for interface compatibility; not read here
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(service, instance)
    instances_node = "%s/instances" % autoscaling_root
    with ZookeeperPool() as zk:
        zk.ensure_path(instances_node)
        encoded_count = str(instance_count).encode("utf8")
        zk.set(instances_node, encoded_count)
def get_boost_factor(zk_boost_path: str) -> float:
    """
    Returns the boost factor stored at ``zk_boost_path`` while its end time is
    still in the future, and 1.0 otherwise.

    :param zk_boost_path: the Zookeeper path holding the boost values
    :returns: the active boost factor, or 1.0 when no boost is active
    """
    now = get_time()
    with ZookeeperPool() as zk:
        boost = get_boost_values(zk_boost_path, zk)
        boost_is_active = now < boost.end_time
        return boost.boost_factor if boost_is_active else 1.0
def get_service_autoscaler_pause(request):
    """
    Returns the content of the autoscaler pause node as text.

    :param request: the API request (not read)
    :returns: the decoded node value, or "0" if the node is missing or a
        ValueError occurs while reading it
    :raises ApiFailure: with status 500 on any other error
    """
    with ZookeeperPool() as zk:
        try:
            raw_value = zk.get(ZK_PAUSE_AUTOSCALE_PATH)[0]
            return raw_value.decode("utf8")
        except (NoNodeError, ValueError):
            return "0"
        except Exception as err:
            raise ApiFailure(err, 500)
def run(self):
    """
    Entry point of the daemon: opens a Zookeeper connection, joins the
    "/paasta-deployd-leader" election and hands ``self.startup`` to it.
    """
    self.log.info("paasta-deployd starting up...")
    with ZookeeperPool() as zk_client:
        # The connection is kept on the instance for the lifetime of the election.
        self.zk = zk_client
        self.log.info("Waiting to become leader")
        election = PaastaLeaderElection(
            self.zk,
            "/paasta-deployd-leader",
            socket.getfqdn(),
            control=self.control,
        )
        self.election = election
        self.is_leader = False
        self.election.run(self.startup)
def update_service_autoscaler_pause(request):
    """
    Pauses the autoscaler by storing an expiry timestamp in Zookeeper.

    The expiry is the current time plus the ``minutes`` value taken from the
    request's JSON body.

    :param request: the API request; ``swagger_data["json_body"]["minutes"]`` is read
    :raises ApiFailure: with status 500 if any Zookeeper operation fails
    """
    json_body = request.swagger_data.get("json_body")
    pause_minutes = json_body["minutes"]
    expiry_time = time.time() + pause_minutes * 60

    with ZookeeperPool() as zk:
        try:
            zk.ensure_path(ZK_PAUSE_AUTOSCALE_PATH)
            encoded_expiry = str(expiry_time).encode("utf-8")
            zk.set(ZK_PAUSE_AUTOSCALE_PATH, encoded_expiry)
        except Exception as err:
            raise ApiFailure(err, 500)
    return None
def autoscaling_is_paused():
    """
    Tells whether the autoscaler pause stored in Zookeeper is still in effect.

    A missing or unparsable pause node counts as "not paused".

    :returns: True if the stored expiry time has not passed yet, False otherwise
    """
    with ZookeeperPool() as zk:
        try:
            raw_expiry = zk.get(ZK_PAUSE_AUTOSCALE_PATH)[0]
            pause_until = float(raw_expiry.decode('utf8'))
        except (NoNodeError, ValueError, AttributeError):
            pause_until = 0

    remaining = pause_until - time.time()
    if remaining < 0:
        return False

    log.debug("Autoscaling is paused for {} more seconds".format(str(remaining)))
    return True
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    """
    Runs one autoscaling pass over the services returned by
    get_configs_of_services_to_scale() for this cluster.

    A failure while handling one service is written to that service's log and
    does not stop the remaining services. If the autoscaling lock is already
    held, the run is skipped with a warning.

    :param soa_dir: the SOA configuration directory to read service configs from
    """
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if not configs:
                return

            marathon_config = load_marathon_config()
            marathon_client = get_marathon_client(
                url=marathon_config.get_url(),
                user=marathon_config.get_username(),
                passwd=marathon_config.get_password(),
            )
            all_marathon_tasks = marathon_client.list_tasks()
            all_mesos_tasks = get_all_running_tasks()

            def counts_as_healthy(task):
                # Tasks with no healthcheck defined are assumed healthy. Tasks with a
                # defined healthcheck but no healthcheck results are assumed unhealthy,
                # unless they are "old", in which case we assume that marathon has
                # stopped healthchecking them but that they are healthy.
                if is_task_healthy(task):
                    return True
                if not marathon_client.get_app(task.app_id).health_checks:
                    return True
                return is_old_task_missing_healthchecks(task, marathon_client)

            with ZookeeperPool():
                for service_config in configs:
                    try:
                        job_id = service_config.format_marathon_app_dict()['id']
                        log.info("Inspecting %s for autoscaling" % job_id)

                        marathon_tasks = {}
                        for task in all_marathon_tasks:
                            if task.id.startswith(job_id) and counts_as_healthy(task):
                                marathon_tasks[task.id] = task
                        if not marathon_tasks:
                            raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")

                        mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                        autoscale_marathon_instance(
                            service_config,
                            list(marathon_tasks.values()),
                            mesos_tasks,
                        )
                    except Exception as err:
                        write_to_log(config=service_config, line='Caught Exception %s' % err)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
def run(self) -> None:
    """
    Entry point of the daemon: counts the process start in metrics, opens a
    Zookeeper connection, joins the "/paasta-deployd-leader" election and hands
    ``self.startup`` to it. Logs a final message once the election returns.
    """
    self.log.info("paasta-deployd starting up...")

    cluster_name = self.config.get_cluster()
    self.metrics.create_counter("process_started", paasta_cluster=cluster_name).count()

    with ZookeeperPool() as zk_client:
        # The connection is kept on the instance for the lifetime of the election.
        self.zk = zk_client
        election = PaastaLeaderElection(
            self.zk,
            "/paasta-deployd-leader",
            socket.getfqdn(),
            control=self.control,
        )
        self.election = election
        self.is_leader = False
        self.log.info("Waiting to become leader")
        self.election.run(self.startup)
        self.log.info("Leadership given up, exiting...")
def get_boosted_load(region: str, pool: str, current_load: float) -> float:
    """
    Return the load to use for autoscaling calculations, taking into account
    the computed boost, if any.

    Any error is logged and answered with ``current_load`` so that the
    autoscaler is never blocked by this function.

    :param region: region used to look up the boost values
    :param pool: pool used to look up the boost values
    :param current_load: the load measured right now
    :returns: ``current_load`` when no boost is active; otherwise the stored
        (or newly computed) expected load if it is greater than ``current_load``
    """
    try:
        zk_boost_path = get_zk_boost_path(region, pool)
        now = get_time()

        with ZookeeperPool() as zk:
            boost = get_boost_values(region, pool, zk)

            boost_has_ended = now >= boost.end_time
            if boost_has_ended:
                # A non-zero expected load means a boost period has just
                # completed: reset it to 0.
                if boost.expected_load > 0:
                    zk.set(zk_boost_path + '/expected_load', '0'.encode('utf-8'))
                return current_load

            # Boost is active. Compute and store the expected load the first
            # time around, reuse the stored value afterwards.
            if boost.expected_load != 0:
                expected_load = boost.expected_load
            else:
                expected_load = current_load * boost.boost_factor
                log.debug('Activating boost, storing expected load: {} in ZooKeeper'.format(expected_load))
                zk.ensure_path(zk_boost_path + '/expected_load')
                zk.set(zk_boost_path + '/expected_load', str(expected_load).encode('utf-8'))

            # The boosted load only wins if the real load is not already higher.
            if expected_load > current_load:
                return expected_load
            return current_load
    except Exception as err:
        # Fail gracefully in the face of ANY error
        log.error('get_boost failed with: {}'.format(err))
        return current_load
def start_deployd(context):
    """
    Starts a DeployDaemon for the test run and waits for it to report that it
    has started.

    Ensures the SOA directory and the /autoscaling Zookeeper node exist,
    records ``soa_dir`` on the context, and starts the daemon unless the
    context already has one. ``daemon.started`` is polled up to 10 times,
    3 seconds apart.

    :param context: test context; ``soa_dir`` and ``daemon`` are set on it
    :raises AssertionError: if the daemon has not started after polling
    """
    try:
        os.makedirs('/nail/etc/services')
    except OSError as e:
        # An already-existing directory is fine; anything else (e.g.
        # permissions) is a real failure and must not be hidden.
        if e.errno != errno.EEXIST:
            raise

    with ZookeeperPool() as zk:
        try:
            zk.create('/autoscaling')
        except NodeExistsError:
            pass

    context.soa_dir = '/nail/etc/services'
    if not hasattr(context, 'daemon'):
        context.daemon = DeployDaemon()
        context.daemon.start()

    for _ in range(0, 10):
        if context.daemon.started:
            return
        time.sleep(3)
    assert context.daemon.started
def start_deployd(context):
    """
    Starts the paasta-deployd process for the test run, waits for it to report
    "Startup finished!" on stderr, then keeps draining its stderr in a
    background thread.

    Ensures the SOA directory and the /autoscaling Zookeeper node exist,
    records ``zk_hosts`` and ``soa_dir`` on the context, and launches the
    daemon unless the context already has one. The background thread counts
    lines containing DEAD_DEPLOYD_WORKER_MESSAGE in
    ``context.num_workers_crashed``.

    :param context: test context; ``zk_hosts``, ``soa_dir``, ``daemon`` and
        ``num_workers_crashed`` are set on it
    :raises Exception: if deployd's stderr closes before startup finishes, or
        if startup is not reported within 60 seconds
    """
    try:
        os.makedirs("/nail/etc/services")
    except OSError as e:
        # An already-existing directory is fine; anything else (e.g.
        # permissions) is a real failure and must not be hidden.
        if e.errno != errno.EEXIST:
            raise

    with ZookeeperPool() as zk:
        try:
            zk.create("/autoscaling")
        except NodeExistsError:
            pass

    context.zk_hosts = "%s/mesos-testcluster" % get_service_connection_string("zookeeper")
    context.soa_dir = "/nail/etc/services"
    if not hasattr(context, "daemon"):
        context.daemon = Popen("paasta-deployd", stderr=PIPE)

    output = context.daemon.stderr.readline().decode("utf-8")
    start = time.time()
    timeout = start + 60
    while "Startup finished!" not in output:
        output = context.daemon.stderr.readline().decode("utf-8")
        if not output:
            # readline() returns an empty result only at end of stream.
            raise Exception("deployd exited prematurely")
        print(output.rstrip("\n"))
        if time.time() > timeout:
            raise Exception("deployd never ran")

    context.num_workers_crashed = 0

    def dont_let_stderr_buffer():
        # Keep reading stderr until the stream closes so the pipe is drained.
        while True:
            line = context.daemon.stderr.readline()
            if not line:
                return
            if DEAD_DEPLOYD_WORKER_MESSAGE.encode("utf-8") in line:
                context.num_workers_crashed += 1
            paasta_print(f"deployd stderr: {line}")

    threading.Thread(target=dont_let_stderr_buffer).start()
    time.sleep(5)
def main(argv=None):
    """
    Starts the paasta-api WSGI server and serves until interrupted.

    :param argv: accepted for interface compatibility; arguments are obtained
        from parse_paasta_api_args()
    """
    monkey.patch_all()
    args = parse_paasta_api_args()

    log_level = logging.DEBUG if args.debug else logging.WARNING
    logging.basicConfig(level=log_level)

    if args.soa_dir:
        settings.soa_dir = args.soa_dir

    listen_address = ('', int(args.port))
    server = WSGIServer(listen_address, make_app())
    log.info("paasta-api started on port %d with soa_dir %s" % (args.port, settings.soa_dir))

    try:
        # We create the Zookeeper pool here to prevent the context manager
        # tearing down the client after each request. This can cause an exception
        # if the API is dealing with two or more requests at the same time!
        with ZookeeperPool():
            server.serve_forever()
    except KeyboardInterrupt:
        exit(0)
def set_boost_factor(
    zk_boost_path: str,
    region: str = '',
    pool: str = '',
    send_clusterman_metrics: bool = False,
    factor: float = DEFAULT_BOOST_FACTOR,
    duration_minutes: int = DEFAULT_BOOST_DURATION,
    override: bool = False,
) -> bool:
    """
    Set a boost factor for a path in zk

    Can be used to boost either cluster or service autoscalers.
    If using for cluster you must specify region, pool and set
    send_clusterman_metrics=True so that clusterman metrics are updated

    otherwise just zk_boost_path is enough.

    :param zk_boost_path: Zookeeper path under which end_time, factor and
        expected_load are stored
    :param region: region passed to the clusterman metrics client
    :param pool: pool passed to the clusterman metrics client and metric dimensions
    :param send_clusterman_metrics: whether to also write the boost to clusterman metrics
    :param factor: the boost factor; values above MAX_BOOST_FACTOR are capped
    :param duration_minutes: how long the boost lasts; capped at MAX_BOOST_DURATION
    :param override: replace a boost that is still active
    :returns: False if the factor is below MIN_BOOST_FACTOR or an active boost
        is not overridden; otherwise whether the values read back from
        Zookeeper match what was written
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    factor_too_high = factor > MAX_BOOST_FACTOR
    if factor_too_high:
        log.warning(
            'Boost factor {} does not sound reasonable. Defaulting to {}'.format(factor, MAX_BOOST_FACTOR)
        )
        factor = MAX_BOOST_FACTOR

    duration_too_long = duration_minutes > MAX_BOOST_DURATION
    if duration_too_long:
        log.warning(
            'Boost duration of {} minutes is too much. Falling back to {}.'.format(
                duration_minutes, MAX_BOOST_DURATION,
            )
        )
        duration_minutes = MAX_BOOST_DURATION

    current_time = get_time()
    end_time = current_time + 60 * duration_minutes

    if clusterman_metrics and send_clusterman_metrics:
        cluster = load_system_paasta_config().get_cluster()
        metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
            region_name=region,
            app_identifier=pool,
        )
        with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
            dimensions = {'cluster': cluster, 'pool': pool}
            metrics_key = clusterman_metrics.generate_key_with_dimensions('boost_factor', dimensions)
            writer.send((metrics_key, current_time, factor))
            if duration_minutes > 0:
                # Record the return to a factor of 1.0 at the end of the boost.
                writer.send((metrics_key, end_time, 1.0))

    # Node path -> value to store, in the order the nodes are written.
    node_values = (
        (zk_boost_path + '/end_time', end_time),
        (zk_boost_path + '/factor', factor),
        (zk_boost_path + '/expected_load', '0'),
    )

    with ZookeeperPool() as zk:
        if not override:
            if current_time < get_boost_values(zk_boost_path, zk).end_time:
                log.error('Boost already active. Not overriding.')
                return False

        try:
            for node_path, _value in node_values:
                zk.ensure_path(node_path)
            for node_path, value in node_values:
                zk.set(node_path, str(value).encode('utf-8'))
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        readable_end_time = datetime.fromtimestamp(end_time).strftime('%c')
        log.info(
            'Load boost: Set capacity boost factor {} at path {} until {}'.format(
                factor, zk_boost_path, readable_end_time,
            )
        )

        # Let's check that this factor has been properly written to zk
        expected_values = BoostValues(
            end_time=end_time,
            boost_factor=factor,
            expected_load=0,
        )
        return get_boost_values(zk_boost_path, zk) == expected_values
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    On every call the cumulative CPU seconds of each Mesos task (system plus
    user time, divided by the task's cpu limit minus 0.1) are compared with the
    sample stored in Zookeeper by the previous call. Utilization of a task is
    the growth of that figure divided by the elapsed seconds; the result is
    the mean over all tasks that appear in both samples.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param system_paasta_config: the system config (not read by this provider)
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to
        autoscale_marathon_instance(); the previous and current samples are
        added to it keyed by their timestamps. A fresh dict is used when omitted.
    :param noop: when true, the new sample is not written to Zookeeper
    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task returned usable stats, or if
        none of the current tasks has a previous sample stored in Zookeeper
    """
    # A `{}` default would be shared between calls and grow without bound.
    if log_utilization_data is None:
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(",") if datum)
        except NoNodeError:
            # First run for this service: nothing to compare against yet.
            last_time = 0.0
            last_cpu_data = []

    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        # After the wait timeout a future may still be pending; asking a
        # pending or cancelled future for its exception raises, so check first.
        if not fut.done() or fut.cancelled():
            return None
        if fut.exception():
            return None
        return fut.result()

    mesos_tasks_stats = dict(
        zip(
            [task["id"] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures],
        )
    )

    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is not None:
            try:
                utime = float(stats["cpus_user_time_secs"])
                stime = float(stats["cpus_system_time_secs"])
                limit = float(stats["cpus_limit"]) - 0.1
                mesos_cpu_data[task_id] = (stime + utime) / limit
            except KeyError:
                # Tasks with incomplete stats are left out of the sample.
                pass

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}" for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(":")
        # Tasks that disappeared since the last sample are ignored.
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service. This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
def save_historical_load(historical_load, zk_path_prefix):
    """
    Serializes the historical load and stores it in Zookeeper.

    :param historical_load: the load history to store
    :param zk_path_prefix: prefix handed to zk_historical_load_path() to locate the node
    """
    with ZookeeperPool() as zk:
        serialized_load = serialize_historical_load(historical_load)
        # presumably zk_historical_load_path() is a pure path formatter, so one
        # call serves both operations - verify against its definition
        load_node = zk_historical_load_path(zk_path_prefix)
        zk.ensure_path(load_node)
        zk.set(load_node, serialized_load)
def get_instances_from_zookeeper(service: str, instance: str) -> int:
    """
    Reads the instance count stored in Zookeeper for a service instance.

    :param service: the service name
    :param instance: the instance name
    :returns: the stored value of the ``instances`` node, converted to int
    """
    with ZookeeperPool() as zk:
        autoscaling_root = compose_autoscaling_zookeeper_root(service, instance)
        raw_count, _stat = zk.get('%s/instances' % autoscaling_root)
        return int(raw_count)