def marathon_job_status(mstatus, client, job_config, verbose):
    try:
        app_id = job_config.format_marathon_app_dict()['id']
    except NoDockerImageError:
        error_msg = "Docker image is not in deployments.json."
        mstatus['error_message'] = error_msg
        return

    mstatus['app_id'] = app_id
    if verbose is True:
        mstatus['slaves'] = list(
            {
                a_sync.block(task.slave)['hostname']
                for task in a_sync.block(get_running_tasks_from_frameworks, app_id)
            },
        )
    mstatus['expected_instance_count'] = job_config.get_instances()

    try:
        app = client.get_app(app_id)
    except marathon.exceptions.NotFoundError:
        mstatus['deploy_status'] = marathon_tools.MarathonDeployStatus.tostring(
            marathon_tools.MarathonDeployStatus.NotRunning,
        )
        mstatus['running_instance_count'] = 0
    else:
        deploy_status = marathon_tools.get_marathon_app_deploy_status(client, app)
        mstatus['deploy_status'] = marathon_tools.MarathonDeployStatus.tostring(deploy_status)

        # by comparing running count with expected count, callers can figure
        # out if the instance is in Healthy, Warning or Critical state.
        mstatus['running_instance_count'] = app.tasks_running

        if deploy_status == marathon_tools.MarathonDeployStatus.Delayed:
            _, backoff_seconds = marathon_tools.get_app_queue_status(client, app_id)
            mstatus['backoff_seconds'] = backoff_seconds

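# Illustrative sketch (not part of the source): the comment in
# marathon_job_status notes that callers compare the running count with the
# expected count to derive a health state. A hypothetical caller might
# classify like this; the thresholds mirror the Healthy/Warning/Critical
# convention used elsewhere in this codebase but are an assumption here.
def _classify_marathon_health(mstatus):
    running = mstatus.get('running_instance_count', 0)
    expected = mstatus.get('expected_instance_count', 0)
    if running >= expected:
        return 'Healthy'
    elif running == 0:
        return 'Critical'
    else:
        return 'Warning'
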
def undrain_tasks(
    to_undrain: Collection[MarathonTask],
    leave_draining: Collection[MarathonTask],
    drain_method: drain_lib.DrainMethod,
    log_deploy_error: LogDeployError,
) -> None:
    # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
    # `paasta mark-for-deployment`), then we should undrain them.
    async def undrain_task(task: MarathonTask) -> None:
        if task not in leave_draining:
            if task.state == 'TASK_UNREACHABLE':
                return
            try:
                await drain_method.stop_draining(task)
            except Exception as e:
                log_deploy_error("Ignoring exception during stop_draining of task %s: %s." % (task, e))

    if to_undrain:
        a_sync.block(
            asyncio.wait,
            [asyncio.ensure_future(undrain_task(task)) for task in to_undrain],
        )

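# Minimal sketch (not from the source) of the a_sync.block(asyncio.wait, ...)
# pattern that undrain_tasks and several functions below rely on, assuming
# a_sync.block(coro_fn, *args) drives coro_fn(*args) to completion on an event
# loop and returns its result. The _demo coroutine is hypothetical; only the
# calling pattern is taken from the surrounding code.
import asyncio

import a_sync


async def _demo(n: int) -> int:
    await asyncio.sleep(0)
    return n


def _run_concurrently_from_sync_code():
    # ensure_future schedules each coroutine as a task on the ambient loop;
    # block() then runs the loop until asyncio.wait reports them all done.
    futures = [asyncio.ensure_future(_demo(i)) for i in range(3)]
    done, pending = a_sync.block(asyncio.wait, futures)
    return sorted(f.result() for f in done)
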
def kill_tasks_if_necessary(self, driver: MesosSchedulerDriver):
    base_task = self.service_config.base_task(self.system_paasta_config)

    all_tasks_with_params = self.task_store.get_all_tasks()

    new_tasks_with_params = self.get_new_tasks(base_task['name'], all_tasks_with_params)
    happy_new_tasks_with_params = self.get_happy_tasks(new_tasks_with_params)

    desired_instances = self.service_config.get_desired_instances()

    # this puts the most-desired tasks first. I would have left them in order of bad->good and used
    # new_tasks_by_desirability[:-desired_instances] instead, but list[:-0] is an empty list, rather than the full
    # list.
    new_task_ids_by_desirability = sorted(
        list(new_tasks_with_params.keys()),
        key=self.make_healthiness_sorter(base_task['name'], all_tasks_with_params),
        reverse=True,
    )
    new_task_ids_to_kill = new_task_ids_by_desirability[desired_instances:]

    old_tasks_with_params = self.get_old_tasks(base_task['name'], all_tasks_with_params)
    old_draining_tasks_with_params = self.get_draining_tasks(old_tasks_with_params)
    old_non_draining_tasks = sorted(
        list(
            set(old_tasks_with_params.keys()) - set(old_draining_tasks_with_params),
        ),
        key=self.make_healthiness_sorter(base_task['name'], all_tasks_with_params),
        reverse=True,
    )

    actions = bounce_lib.crossover_bounce(
        new_config={"instances": desired_instances},
        new_app_running=True,
        happy_new_tasks=happy_new_tasks_with_params.keys(),
        old_non_draining_tasks=new_task_ids_to_kill + old_non_draining_tasks,
    )

    with a_sync.idle_event_loop():
        futures = []
        for task in set(new_tasks_with_params.keys()) - set(actions['tasks_to_drain']):
            futures.append(asyncio.ensure_future(self.undrain_task(task)))
        for task in actions['tasks_to_drain']:
            futures.append(asyncio.ensure_future(self.drain_task(task)))
        if futures:
            a_sync.block(asyncio.wait, futures)

        async def kill_if_safe_to_kill(task_id: str):
            if await self.drain_method.is_safe_to_kill(self.make_drain_task(task_id)):
                self.kill_task(driver, task_id)

        futures = []
        for task, parameters in all_tasks_with_params.items():
            if parameters.is_draining and parameters.mesos_task_state in LIVE_TASK_STATES:
                futures.append(asyncio.ensure_future(kill_if_safe_to_kill(task)))
        if futures:
            a_sync.block(asyncio.wait, futures)

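# Demonstration (not from the source) of the slicing pitfall that the comment
# in kill_tasks_if_necessary calls out: -0 == 0, so lst[:-n] silently becomes
# the empty slice lst[:0] when n == 0. Sorting best-first and keeping lst[n:]
# avoids that edge case entirely.
def _slice_pitfall_demo():
    lst = ['a', 'b', 'c']
    assert lst[:-0] == []      # surprising: "drop the last zero" drops everything
    assert lst[0:] == lst      # best-first slicing keeps everything when n == 0
    assert lst[2:] == ['c']    # ...and still drops the least-desired leftovers
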
def get_count_running_tasks_on_slave(hostname):
    """Return the number of tasks running on a particular slave
    or 0 if the slave is not found.
    :param hostname: hostname of the slave
    :returns: integer count of mesos tasks"""
    mesos_state = a_sync.block(get_mesos_master().state_summary)
    task_counts = a_sync.block(get_mesos_task_count_by_slave, mesos_state)
    counts = [
        slave['task_counts'].count
        for slave in task_counts
        if slave['task_counts'].slave['hostname'] == hostname
    ]
    if counts:
        return counts[0]
    else:
        return 0

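# Hypothetical usage sketch (the helper name is invented): pairing the counter
# above with a host list to find slaves that are safe to take down for
# maintenance.
def _find_idle_slaves(hostnames):
    return [h for h in hostnames if get_count_running_tasks_on_slave(h) == 0]
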
def get_tasks_by_state_for_app(
    app: MarathonApp,
    drain_method: drain_lib.DrainMethod,
    service: str,
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    system_paasta_config: SystemPaastaConfig,
    log_deploy_error: LogDeployError,
    draining_hosts: Collection[str],
) -> TasksByStateDict:
    tasks_by_state: TasksByStateDict = {
        "happy": set(),
        "unhappy": set(),
        "draining": set(),
        "at_risk": set(),
    }

    happy_tasks = bounce_lib.get_happy_tasks(
        app, service, nerve_ns, system_paasta_config, **bounce_health_params
    )

    async def categorize_task(task: MarathonTask) -> None:
        try:
            is_draining = await drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                f"Ignoring {type(e).__name__} exception during is_draining of task "
                f"{task.id} {e.args}. Treating task as 'unhappy'."
            )
            state = "unhappy"
        else:
            if is_draining is True:
                state = "draining"
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = "at_risk"
                else:
                    state = "happy"
            else:
                state = "unhappy"

        tasks_by_state[state].add(task)

    if app.tasks:
        a_sync.block(
            asyncio.wait,
            [asyncio.ensure_future(categorize_task(task)) for task in app.tasks],
        )

    return tasks_by_state

def get_tasks_by_state_for_app(
    app: MarathonApp,
    drain_method: drain_lib.DrainMethod,
    service: str,
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    system_paasta_config: SystemPaastaConfig,
    log_deploy_error: LogDeployError,
    draining_hosts: Collection[str],
) -> TasksByStateDict:
    tasks_by_state: TasksByStateDict = {
        'happy': set(),
        'unhappy': set(),
        'draining': set(),
        'at_risk': set(),
    }

    happy_tasks = bounce_lib.get_happy_tasks(
        app, service, nerve_ns, system_paasta_config, **bounce_health_params
    )

    async def categorize_task(task: MarathonTask) -> None:
        try:
            is_draining = await drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                "Ignoring exception during is_draining of task %s:"
                " %s. Treating task as 'unhappy'." % (task, e),
            )
            state = 'unhappy'
        else:
            if is_draining is True:
                state = 'draining'
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = 'at_risk'
                else:
                    state = 'happy'
            else:
                state = 'unhappy'

        tasks_by_state[state].add(task)

    if app.tasks:
        a_sync.block(
            asyncio.wait,
            [asyncio.ensure_future(categorize_task(task)) for task in app.tasks],
        )

    return tasks_by_state

def test_job_status_include_replicaset_non_verbose(mock_get_kubernetes_app_by_name):
    kstatus = {}
    a_sync.block(
        pik.job_status,
        kstatus=kstatus,
        client=mock.Mock(),
        job_config=mock.Mock(),
        pod_list=[],
        replicaset_list=[mock.Mock(), mock.Mock(), mock.Mock()],
        verbose=0,
        namespace=mock.Mock(),
    )

    assert len(kstatus["replicasets"]) == 3

def filter_tasks_in_smartstack(
    tasks: Collection[MarathonTask],
    service: str,
    nerve_ns: str,
    system_paasta_config: SystemPaastaConfig,
    max_hosts_to_query: int = 20,
    haproxy_min_fraction_up: float = 1.0,
) -> List[MarathonTask]:
    all_hosts = list({t.host for t in tasks})
    random.shuffle(all_hosts)
    # We select a random 20 hosts here. This should be enough most of the time: for services discovered at the habitat
    # level, in clusters with 2 habitats, there's about a 2 * (1/2) ** 20 ~= 2-per-million chance of not picking at
    # least one host in each habitat. For clusters with 3 habitats, the odds are about 3 * (2/3) ** 20 ~= 1-in-1000.
    # The only real effect would be that the bounce would decide to kill fewer old tasks, causing us to take another
    # round. If this becomes a problem, we can try to select tasks more intelligently.
    selected_hosts = all_hosts[:max_hosts_to_query]
    registered_task_count: typing.Counter[MarathonTask] = Counter()

    async def get_registered_tasks_on_host(host):
        try:
            registered_task_count.update(
                set(
                    await a_sync.to_async(get_registered_marathon_tasks)(
                        synapse_host=host,
                        synapse_port=system_paasta_config.get_synapse_port(),
                        synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(),
                        service=compose_job_id(service, nerve_ns),
                        marathon_tasks=tasks,
                    )
                )
            )
        except (ConnectionError, RequestException):
            log.warning(
                f"Failed to connect to smartstack on {host}; this may cause us to consider tasks unhealthy."
            )

    if selected_hosts:
        a_sync.block(
            asyncio.wait,
            [
                asyncio.ensure_future(get_registered_tasks_on_host(host))
                for host in selected_hosts
            ],
            timeout=30,
        )

    threshold = len(selected_hosts) * haproxy_min_fraction_up
    return [t for t in tasks if registered_task_count[t] >= threshold]

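# Sanity check (not from the source) of the probabilities quoted in the
# comment above. By a union bound, the chance that a random sample of
# `sampled_hosts` hosts misses at least one of `habitats` equally-sized
# habitats is roughly habitats * ((habitats - 1) / habitats) ** sampled_hosts.
def _miss_probability(habitats: int, sampled_hosts: int = 20) -> float:
    return habitats * ((habitats - 1) / habitats) ** sampled_hosts

# _miss_probability(2) ~= 1.9e-06, the "2-per-million" in the comment;
# _miss_probability(3) ~= 9.0e-04, roughly the quoted "1-in-1000".
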
def main(hostnames: Sequence[str]) -> None:
    master = get_mesos_master()
    try:
        mesos_state = block(master.state)
    except MasterNotAvailableException as e:
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    slaves = [
        slave
        for slave in mesos_state.get("slaves", [])
        if slave["hostname"] in hostnames
    ]
    tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    filtered_tasks = filter_tasks_for_slaves(slaves, tasks)
    resource_info_dict = calculate_resource_utilization_for_slaves(slaves, filtered_tasks)
    resource_utilizations = resource_utillizations_from_resource_info(
        total=resource_info_dict["total"], free=resource_info_dict["free"]
    )
    output = {}
    for metric in resource_utilizations:
        utilization = metric.total - metric.free
        if int(metric.total) == 0:
            utilization_perc = 100
        else:
            utilization_perc = utilization / float(metric.total) * 100
        output[metric.metric] = {
            "total": metric.total,
            "used": utilization,
            "perc": utilization_perc,
        }
    print(json.dumps(output))

def status_chronos_jobs(client, jobs, job_config, verbose):
    """Returns a formatted string of the status of a list of chronos jobs

    :param jobs: list of dicts of chronos job info as returned by the chronos
        client
    :param job_config: dict containing configuration about these jobs as
        provided by chronos_tools.load_chronos_job_config().
    :param verbose: int verbosity level
    """
    if jobs == []:
        return "%s: chronos job is not set up yet" % PaastaColors.yellow("Warning")
    else:
        output = []
        desired_state = job_config.get_desired_state_human()
        output.append("Desired: %s" % desired_state)
        for job in jobs:
            running_task_count = len(
                select_tasks_by_id(
                    a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                    job["name"],
                )
            )
            output.append(format_chronos_job_status(client, job, running_task_count, verbose))
        return "\n".join(output)

def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    marathon_clients = get_marathon_clients(get_marathon_servers(system_paasta_config))
    apps_with_clients = get_marathon_apps_with_clients(
        marathon_clients.get_all_clients(), embed_tasks=True
    )
    all_mesos_tasks = a_sync.block(get_all_running_tasks)
    with ZookeeperPool():
        for config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    [app for (app, client) in apps_with_clients],
                    all_mesos_tasks,
                    config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as e:
                write_to_log(config=config, line="Caught Exception %s" % e, level="debug")

def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts
    :param hostnames: list of hostnames to unreserve resources on
    """
    mesos_state = a_sync.block(get_mesos_master().state_summary)
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [
        slave for slave in mesos_state["slaves"] if slave["hostname"] in hosts
    ]
    for slave in known_slaves:
        hostname = slave["hostname"]
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave["id"]
        resources = []
        if MAINTENANCE_ROLE in slave["reserved_resources"]:
            for resource in ["disk", "mem", "cpus", "gpus"]:
                reserved_resource = slave["reserved_resources"][MAINTENANCE_ROLE][resource]
                resources.append(Resource(name=resource, amount=reserved_resource))
            try:
                unreserve(slave_id=slave_id, resources=resources)
            except HTTPError:
                raise HTTPError(
                    f"Failed unreserving all of the resources on {hostname} ({slave_id}). Aborting."
                )

def _run_mesos_checks(mesos_master: MesosMaster, mesos_state: MesosState) -> Sequence[HealthCheckResult]:
    mesos_state_status = metastatus_lib.get_mesos_state_status(mesos_state)

    metrics = a_sync.block(mesos_master.metrics_snapshot)
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics, mesos_state=mesos_state
    )
    return mesos_state_status + mesos_metrics_status  # type: ignore

def test_get_files_for_tasks_all():
    mock_task = asynctest.MagicMock(spec=task.Task)
    mock_file = Mock()
    mock_file.exists = asynctest.CoroutineMock(return_value=True)
    mock_task.file.return_value = mock_file
    files = cluster.get_files_for_tasks([mock_task], ["myfile"], 1)
    files = a_sync.block(aiter_to_list, files)
    assert files == [mock_file]

def chronos_instance_status(instance_status, service, instance, verbose):
    cstatus = {}
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
    )
    cstatus['desired_state'] = job_config.get_desired_state()
    job_type = chronos_tools.get_job_type(job_config.config_dict)
    if job_type == chronos_tools.JobType.Scheduled:
        schedule_type = 'schedule'
        schedule = job_config.get_schedule()
        epsilon = job_config.get_epsilon()
        time_zone = job_config.get_schedule_time_zone()
        if time_zone == 'null' or time_zone is None:
            time_zone = 'UTC'
        cstatus['schedule'] = {}
        cstatus['schedule']['schedule'] = schedule
        cstatus['schedule']['epsilon'] = epsilon
        cstatus['schedule']['time_zone'] = time_zone
    elif job_type == chronos_tools.JobType.Dependent:
        schedule_type = 'parents'
        parents = job_config.get_parents()
        cstatus['parents'] = parents
    else:
        schedule_type = 'unknown'
    cstatus['schedule_type'] = schedule_type
    cstatus['status'] = {}
    if verbose:
        running_task_count = len(
            select_tasks_by_id(
                a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                job_config.get_job_name(),
            ),
        )
        cstatus['status']['mesos_state'] = 'running' if running_task_count else 'not_running'
    cstatus['status']['disabled_state'] = 'not_scheduled' if job_config.get_disabled() else 'scheduled'
    cstatus['status']['chronos_state'] = chronos_tools.get_chronos_status_for_job(client, service, instance)
    cstatus['command'] = job_config.get_cmd()
    last_time, last_status = chronos_tools.get_status_last_run(job_config.config_dict)
    if last_status == chronos_tools.LastRunState.Success:
        last_status = 'success'
    elif last_status == chronos_tools.LastRunState.Fail:
        last_status = 'fail'
    elif last_status == chronos_tools.LastRunState.NotRun:
        last_status = 'not_run'
    else:
        last_status = ''
    if last_status == 'not_run' or last_status == '':
        last_time = 'never'
    cstatus['last_status'] = {}
    cstatus['last_status']['result'] = last_status
    cstatus['last_status']['time'] = last_time
    return cstatus

def _run_mesos_checks(mesos_master, mesos_state):
    mesos_state_status = metastatus_lib.get_mesos_state_status(mesos_state)

    metrics = a_sync.block(mesos_master.metrics_snapshot)
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    return mesos_state_status + mesos_metrics_status

def get_mesos_tasks_and_slaves(
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Sequence[MarathonTask], List[Any]]:
    clients = get_marathon_clients(get_marathon_servers(system_paasta_config))
    all_clients: Sequence[MarathonClient] = clients.get_all_clients()
    all_tasks: List[MarathonTask] = []
    for client in all_clients:
        all_tasks.extend(client.list_tasks())
    mesos_slaves = a_sync.block(get_slaves)

    return all_tasks, mesos_slaves

def test_get_files_for_tasks_some():
    mock_task = asynctest.MagicMock(spec=task.Task)
    mock_file = Mock()
    mock_file_2 = Mock()
    mock_file.exists = asynctest.CoroutineMock(return_value=False)
    mock_file_2.exists = asynctest.CoroutineMock(return_value=True)
    mock_task.file.side_effect = [mock_file, mock_file_2]
    files = cluster.get_files_for_tasks([mock_task], ["myfile", "myotherfile"], 1)
    files = a_sync.block(aiter_to_list, files)
    assert files == [mock_file_2]

def assert_quorum_size():
    masters, quorum = get_num_masters(), a_sync.block(get_mesos_quorum)
    if quorum_ok(masters, quorum):
        return HealthCheckResult(
            message="Quorum: masters: %d configured quorum: %d " % (masters, quorum),
            healthy=True,
        )
    else:
        return HealthCheckResult(
            message="CRITICAL: Number of masters (%d) less than configured quorum(%d)." % (masters, quorum),
            healthy=False,
        )

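# For context, a plausible reading of quorum_ok (an assumption; the real
# helper may differ): a Mesos cluster can only make progress while at least
# `quorum` masters are registered, so the check reduces to a comparison.
def _quorum_ok_sketch(masters: int, quorum: int) -> bool:
    return masters >= quorum
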
def test_get_files_for_tasks_no_files():
    attrs = {"id": "foo"}
    mock_task = asynctest.MagicMock(spec=task.Task)
    mock_task.__getitem__.side_effect = lambda x: attrs[x]
    mock_file = Mock()
    mock_file.exists = asynctest.CoroutineMock(return_value=False)
    mock_task.file.return_value = mock_file
    files = cluster.get_files_for_tasks([mock_task], ["myfile"], 1)
    with raises(exceptions.FileNotFoundForTaskException) as excinfo:
        files = a_sync.block(aiter_to_list, files)
    assert "None of the tasks in foo contain the files in list myfile" in str(excinfo.value)

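# The tests above block on aiter_to_list. A minimal sketch of such a helper,
# assuming it simply drains an async iterator into a list (the real
# implementation may differ):
async def aiter_to_list(aiter):
    return [item async for item in aiter]
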
def instance_tasks(request):
    status = instance_status(request)
    slave_hostname = request.swagger_data.get('slave_hostname', None)
    verbose = request.swagger_data.get('verbose', False)
    try:
        mstatus = status['marathon']
    except KeyError:
        raise ApiFailure("Only marathon tasks supported", 400)
    tasks = a_sync.block(get_tasks_from_app_id, mstatus['app_id'], slave_hostname=slave_hostname)
    if verbose:
        tasks = [add_executor_info(task) for task in tasks]
        tasks = [add_slave_info(task) for task in tasks]
    return [task._Task__items for task in tasks]

def check_registration(threshold_percentage):
    try:
        mesos_state = block(get_mesos_master().state)
    except MasterNotAvailableException as e:
        print("Could not find Mesos Master: %s" % e.message)
        sys.exit(1)

    config = load_system_paasta_config()
    autoscaling_resources = config.get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        print("Checking %s" % resource["id"])
        try:
            scaler = get_scaler(resource["type"])(
                resource=resource,
                pool_settings=None,
                config_folder=None,
                dry_run=True,
                utilization_error=0.0,
                max_increase=0.0,
                max_decrease=0.0,
            )
        except KeyError:
            print("Couldn't find a metric provider for resource of type: {}".format(resource["type"]))
            continue
        if len(scaler.instances) == 0:
            print("No instances for this resource")
            continue
        elif scaler.is_new_autoscaling_resource():
            # See OPS-13784
            threshold = config.get_monitoring_config().get("check_registered_slave_threshold")
            print(
                f"Autoscaling resource was created within last {threshold}"
                " seconds and would probably fail this check"
            )
            continue
        else:
            slaves = scaler.get_aws_slaves(mesos_state)
            percent_registered = float(float(len(slaves)) / float(len(scaler.instances))) * 100
            if percent_registered < float(threshold_percentage):
                print(
                    "CRIT: Only found {}% of instances in {} registered in mesos. "
                    "Please check for puppet or AMI baking problems!".format(
                        percent_registered, resource["id"]
                    )
                )
                return False
    print(
        "OK: Found more than {}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage)
    )
    return True

def get_autoscaling_info(apps_with_clients, service_config):
    if (
        service_config.get_max_instances()
        and service_config.get_desired_state() == "start"
    ):
        all_mesos_tasks = a_sync.block(get_cached_list_of_running_tasks_from_frameworks)
        autoscaling_params = service_config.get_autoscaling_params()
        autoscaling_params.update({"noop": True})
        system_paasta_config = load_system_paasta_config()
        try:
            marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                [app for (app, client) in apps_with_clients],
                all_mesos_tasks,
                service_config,
                system_paasta_config,
            )
            utilization = get_utilization(
                marathon_service_config=service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data={},
                marathon_tasks=list(marathon_tasks.values()),
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=service_config.get_instances(),
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=service_config.get_instances(),
                marathon_service_config=service_config,
                num_healthy_instances=len(marathon_tasks),
                persist_data=False,
            )
        except MetricsProviderNoDataError:
            utilization = None
            new_instance_count = None
        return ServiceAutoscalingInfo(
            current_instances=service_config.get_instances(),
            max_instances=service_config.get_max_instances(),
            min_instances=service_config.get_min_instances(),
            current_utilization=utilization,
            target_instances=new_instance_count,
        )
    return None

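# Hedged sketch of what get_error_from_utilization appears to compute, judging
# from its arguments (an assumption; the real function may also ignore errors
# too small to act on given the current instance count): the proportional
# distance of the observed utilization from the autoscaler's setpoint.
def _utilization_error_sketch(utilization: float, setpoint: float) -> float:
    return (utilization - setpoint) / setpoint
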
def _clean_up_paasta_native_frameworks(context):
    clear_mesos_tools_cache()
    # context.etc_paasta signals that we actually have configured the mesos-cli.json; without this, we don't know where
    # to connect to clean up paasta native frameworks.
    if hasattr(context, "etc_paasta"):
        for framework in a_sync.block(mesos_tools.get_mesos_master().frameworks, active_only=True):
            if framework.name.startswith("paasta_native ") or framework.name == getattr(context, "framework_name", ""):
                paasta_print("cleaning up framework %s" % framework.name)
                try:
                    mesos_tools.terminate_framework(framework.id)
                except requests.exceptions.HTTPError as e:
                    paasta_print(f"Got exception when terminating framework {framework.id}: {e}")

def check_mesos_no_duplicate_frameworks() -> None:
    options = parse_args()
    check = options.check.split(",")
    master = get_mesos_master()
    try:
        state = block(master.state)
    except MasterNotAvailableException as e:
        print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_no_duplicate_frameworks(state, check)
    if result.healthy:
        print("OK: " + result.message)
        sys.exit(0)
    else:
        print(result.message)
        sys.exit(2)

def check_mesos_active_frameworks() -> None:
    options = parse_args()
    expected = options.expected.split(',')
    master = get_mesos_master()
    try:
        state = block(master.state)
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_frameworks_exist(state, expected)
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print(result.message)
        sys.exit(2)

def instance_task(request):
    status = instance_status(request)
    task_id = request.swagger_data.get('task_id', None)
    verbose = request.swagger_data.get('verbose', False)
    try:
        mstatus = status['marathon']
    except KeyError:
        raise ApiFailure("Only marathon tasks supported", 400)
    try:
        task = a_sync.block(get_task, task_id, app_id=mstatus['app_id'])
    except TaskNotFound:
        raise ApiFailure(f"Task with id {task_id} not found", 404)
    except Exception:
        error_message = traceback.format_exc()
        raise ApiFailure(error_message, 500)
    if verbose:
        task = add_slave_info(task)
        task = add_executor_info(task)
    return task._Task__items

def status_mesos_tasks(
    service: str,
    instance: str,
    normal_instance_count: int,
    verbose: int,
) -> str:
    job_id = marathon_tools.format_job_id(service, instance)
    # We have to add a spacer at the end to make sure we only return
    # things for service.main and not service.main_foo
    filter_string = f"{job_id}{marathon_tools.MESOS_TASK_SPACER}"
    try:
        count = len(
            select_tasks_by_id(
                a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                filter_string,
            )
        )
        if count >= normal_instance_count:
            status = PaastaColors.green("Healthy")
            count_str = PaastaColors.green("(%d/%d)" % (count, normal_instance_count))
        elif count == 0:
            status = PaastaColors.red("Critical")
            count_str = PaastaColors.red("(%d/%d)" % (count, normal_instance_count))
        else:
            status = PaastaColors.yellow("Warning")
            count_str = PaastaColors.yellow("(%d/%d)" % (count, normal_instance_count))
        running_string = PaastaColors.bold('TASK_RUNNING')
        output = f"Mesos: {status} - {count_str} tasks in the {running_string} state."
    except ReadTimeout:
        return "Error: talking to Mesos timed out. It may be overloaded."

    if verbose > 0:
        tail_lines = calculate_tail_lines(verbose_level=verbose)
        output += '\n' + status_mesos_tasks_verbose(
            filter_string=filter_string,
            get_short_task_id=get_short_task_id,
            tail_lines=tail_lines,
        )
    return output

def resources_utilization(request):
    master = get_mesos_master()
    mesos_state = block(master.state)

    groupings = request.swagger_data.get('groupings', ['superregion'])
    # swagger actually makes the key None if it's not set
    if groupings is None:
        groupings = ['superregion']
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    sorting_function = metastatus_lib.sort_func_for_attributes(groupings)

    filters = request.swagger_data.get('filter', [])
    filters = parse_filters(filters)
    filter_funcs = [
        metastatus_lib.make_filter_slave_func(attr, vals)
        for attr, vals in filters.items()
    ]

    resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=grouping_function,
        mesos_state=mesos_state,
        filters=filter_funcs,
        sort_func=sorting_function,
    )

    response_body = []
    for k, v in resource_info_dict.items():
        group = {'groupings': {}}
        for grouping, value in k:
            group['groupings'][grouping] = value
        for resource, value in v['total']._asdict().items():
            group[resource] = {'total': value}
        for resource, value in v['free']._asdict().items():
            group[resource]['free'] = value
        for resource in v['free']._fields:
            group[resource]['used'] = group[resource]['total'] - group[resource]['free']

        response_body.append(group)

    return Response(json_body=response_body, status_code=200)

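# Illustrative response shape for one grouping (values invented), following
# the loop above: each resource key carries total/free/used alongside the
# groupings that produced it.
# [
#     {
#         "groupings": {"superregion": "example-superregion"},
#         "cpus": {"total": 100, "free": 40, "used": 60},
#         "mem": {"total": 2048, "free": 512, "used": 1536},
#     },
# ]
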
def main():
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(system_paasta_config)
    )
    all_clients = clients.get_all_clients()
    all_tasks = []
    for client in all_clients:
        all_tasks.extend(client.list_tasks())
    mesos_slaves = a_sync.block(get_slaves)
    smartstack_replication_checker = MesosSmartstackReplicationChecker(
        mesos_slaves, system_paasta_config
    )

    for service in list_services(soa_dir=args.soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=args.soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster,
            instance_type_class=marathon_tools.MarathonServiceConfig,
        ):
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_tasks=all_tasks,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.'
                    % instance_config.job_id,
                )