Example #1
def marathon_job_status(mstatus, client, job_config, verbose):
    try:
        app_id = job_config.format_marathon_app_dict()['id']
    except NoDockerImageError:
        error_msg = "Docker image is not in deployments.json."
        mstatus['error_message'] = error_msg
        return

    mstatus['app_id'] = app_id
    if verbose is True:
        mstatus['slaves'] = list(
            {a_sync.block(task.slave)['hostname'] for task in a_sync.block(get_running_tasks_from_frameworks, app_id)},
        )
    mstatus['expected_instance_count'] = job_config.get_instances()

    try:
        app = client.get_app(app_id)
    except marathon.exceptions.NotFoundError:
        mstatus['deploy_status'] = marathon_tools.MarathonDeployStatus.tostring(
            marathon_tools.MarathonDeployStatus.NotRunning,
        )
        mstatus['running_instance_count'] = 0
    else:
        deploy_status = marathon_tools.get_marathon_app_deploy_status(client, app)
        mstatus['deploy_status'] = marathon_tools.MarathonDeployStatus.tostring(deploy_status)
        # by comparing running count with expected count, callers can figure
        # out if the instance is in Healthy, Warning or Critical state.
        mstatus['running_instance_count'] = app.tasks_running

        if deploy_status == marathon_tools.MarathonDeployStatus.Delayed:
            _, backoff_seconds = marathon_tools.get_app_queue_status(client, app_id)
            mstatus['backoff_seconds'] = backoff_seconds
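
The snippets in this collection share one idiom: a_sync.block(async_callable, *args, **kwargs) drives a coroutine function to completion from synchronous code and returns its result, as in a_sync.block(task.slave) and a_sync.block(get_running_tasks_from_frameworks, app_id) above. A minimal plain-asyncio sketch of that pattern; the real a_sync helper may handle loop reuse and cleanup differently, and fetch_hostname is only a hypothetical stand-in:

import asyncio

def block(async_callable, *args, **kwargs):
    # Sketch only: run the coroutine on a throwaway event loop and return its result.
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(async_callable(*args, **kwargs))
    finally:
        loop.close()

async def fetch_hostname(task_id):
    # Hypothetical coroutine standing in for awaitables like task.slave above.
    await asyncio.sleep(0)
    return f"host-for-{task_id}"

print(block(fetch_hostname, "my-task-id"))  # host-for-my-task-id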
Example #2
def undrain_tasks(
    to_undrain: Collection[MarathonTask],
    leave_draining: Collection[MarathonTask],
    drain_method: drain_lib.DrainMethod,
    log_deploy_error: LogDeployError,
) -> None:
    # If any tasks on the new app happen to be draining (e.g. someone reverts to an older version with
    # `paasta mark-for-deployment`), then we should undrain them.

    async def undrain_task(task: MarathonTask) -> None:
        if task not in leave_draining:
            if task.state == 'TASK_UNREACHABLE':
                return
            try:
                await drain_method.stop_draining(task)
            except Exception as e:
                log_deploy_error(
                    "Ignoring exception during stop_draining of task %s: %s." %
                    (task, e))

    if to_undrain:
        a_sync.block(
            asyncio.wait,
            [asyncio.ensure_future(undrain_task(task)) for task in to_undrain],
        )
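
Example 2 shows the fan-out idiom used throughout these examples: wrap each coroutine in a task with asyncio.ensure_future, then hand the list to asyncio.wait via a_sync.block so the caller blocks until every task finishes. asyncio.wait does not re-raise task exceptions, which is why undrain_task logs failures itself. A small plain-asyncio sketch of the same shape, with a hypothetical undrain_one coroutine:

import asyncio

async def undrain_one(task_name):
    # Placeholder for a real drain-library call such as drain_method.stop_draining.
    await asyncio.sleep(0)
    print(f"undrained {task_name}")

async def undrain_all(task_names):
    futures = [asyncio.ensure_future(undrain_one(name)) for name in task_names]
    if futures:
        # wait() returns (done, pending); exceptions stay stored on the futures.
        await asyncio.wait(futures)

asyncio.run(undrain_all(["task-a", "task-b"]))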
Example #3
    def kill_tasks_if_necessary(self, driver: MesosSchedulerDriver):
        base_task = self.service_config.base_task(self.system_paasta_config)

        all_tasks_with_params = self.task_store.get_all_tasks()

        new_tasks_with_params = self.get_new_tasks(base_task['name'], all_tasks_with_params)
        happy_new_tasks_with_params = self.get_happy_tasks(new_tasks_with_params)

        desired_instances = self.service_config.get_desired_instances()
        # this puts the most-desired tasks first. I would have left them in order of bad->good and used
        # new_tasks_by_desirability[:-desired_instances] instead, but list[:-0] is an empty list, rather than the full
        # list.
        new_task_ids_by_desirability = sorted(
            list(new_tasks_with_params.keys()),
            key=self.make_healthiness_sorter(base_task['name'], all_tasks_with_params),
            reverse=True,
        )
        new_task_ids_to_kill = new_task_ids_by_desirability[desired_instances:]

        old_tasks_with_params = self.get_old_tasks(base_task['name'], all_tasks_with_params)
        old_draining_tasks_with_params = self.get_draining_tasks(old_tasks_with_params)
        old_non_draining_tasks = sorted(
            list(
                set(old_tasks_with_params.keys()) -
                set(old_draining_tasks_with_params),
            ),
            key=self.make_healthiness_sorter(base_task['name'], all_tasks_with_params),
            reverse=True,
        )

        actions = bounce_lib.crossover_bounce(
            new_config={"instances": desired_instances},
            new_app_running=True,
            happy_new_tasks=happy_new_tasks_with_params.keys(),
            old_non_draining_tasks=new_task_ids_to_kill + old_non_draining_tasks,
        )

        with a_sync.idle_event_loop():
            futures = []
            for task in set(new_tasks_with_params.keys()) - set(actions['tasks_to_drain']):
                futures.append(asyncio.ensure_future(self.undrain_task(task)))
            for task in actions['tasks_to_drain']:
                futures.append(asyncio.ensure_future(self.drain_task(task)))

            if futures:
                a_sync.block(asyncio.wait, futures)

            async def kill_if_safe_to_kill(task_id: str):
                if await self.drain_method.is_safe_to_kill(self.make_drain_task(task_id)):
                    self.kill_task(driver, task_id)

            futures = []
            for task, parameters in all_tasks_with_params.items():
                if parameters.is_draining and parameters.mesos_task_state in LIVE_TASK_STATES:
                    futures.append(asyncio.ensure_future(kill_if_safe_to_kill(task)))
            if futures:
                a_sync.block(asyncio.wait, futures)
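
The slicing comment in Example 3 is easy to verify: a slice bound of -0 is just 0, so a worst-first ordering with a [:-desired_instances] kill slice silently breaks when desired_instances is 0, while the best-first ordering with [desired_instances:] handles it correctly.

ids_worst_first = ["t1", "t2", "t3"]
ids_best_first = list(reversed(ids_worst_first))
desired_instances = 0  # scale-to-zero case

print(ids_worst_first[:-desired_instances])  # [] -- kill list is empty, nothing gets killed
print(ids_best_first[desired_instances:])    # ['t3', 't2', 't1'] -- correct: kill everything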
Example #4
def get_count_running_tasks_on_slave(hostname):
    """Return the number of tasks running on a paticular slave
    or 0 if the slave is not found.
    :param hostname: hostname of the slave
    :returns: integer count of mesos tasks"""
    mesos_state = a_sync.block(get_mesos_master().state_summary)
    task_counts = a_sync.block(get_mesos_task_count_by_slave, mesos_state)
    counts = [slave['task_counts'].count for slave in task_counts if slave['task_counts'].slave['hostname'] == hostname]
    if counts:
        return counts[0]
    else:
        return 0
Example #5
def get_tasks_by_state_for_app(
    app: MarathonApp,
    drain_method: drain_lib.DrainMethod,
    service: str,
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    system_paasta_config: SystemPaastaConfig,
    log_deploy_error: LogDeployError,
    draining_hosts: Collection[str],
) -> TasksByStateDict:
    tasks_by_state: TasksByStateDict = {
        "happy": set(),
        "unhappy": set(),
        "draining": set(),
        "at_risk": set(),
    }

    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns,
                                             system_paasta_config,
                                             **bounce_health_params)

    async def categorize_task(task: MarathonTask) -> None:
        try:
            is_draining = await drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                f"Ignoring {type(e).__name__} exception during is_draining of task "
                f"{task.id} {e.args}. Treating task as 'unhappy'.")
            state = "unhappy"
        else:
            if is_draining is True:
                state = "draining"
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = "at_risk"
                else:
                    state = "happy"
            else:
                state = "unhappy"
        tasks_by_state[state].add(task)

    if app.tasks:
        a_sync.block(
            asyncio.wait,
            [
                asyncio.ensure_future(categorize_task(task))
                for task in app.tasks
            ],
        )

    return tasks_by_state
Example #6
def get_tasks_by_state_for_app(
    app: MarathonApp,
    drain_method: drain_lib.DrainMethod,
    service: str,
    nerve_ns: str,
    bounce_health_params: Dict[str, Any],
    system_paasta_config: SystemPaastaConfig,
    log_deploy_error: LogDeployError,
    draining_hosts: Collection[str],
) -> TasksByStateDict:
    tasks_by_state: TasksByStateDict = {
        'happy': set(),
        'unhappy': set(),
        'draining': set(),
        'at_risk': set(),
    }

    happy_tasks = bounce_lib.get_happy_tasks(app, service, nerve_ns,
                                             system_paasta_config,
                                             **bounce_health_params)

    async def categorize_task(task: MarathonTask) -> None:
        try:
            is_draining = await drain_method.is_draining(task)
        except Exception as e:
            log_deploy_error(
                "Ignoring exception during is_draining of task %s:"
                " %s. Treating task as 'unhappy'." % (task, e), )
            state = 'unhappy'
        else:
            if is_draining is True:
                state = 'draining'
            elif task in happy_tasks:
                if task.host in draining_hosts:
                    state = 'at_risk'
                else:
                    state = 'happy'
            else:
                state = 'unhappy'
        tasks_by_state[state].add(task)

    if app.tasks:
        a_sync.block(
            asyncio.wait,
            [
                asyncio.ensure_future(categorize_task(task))
                for task in app.tasks
            ],
        )

    return tasks_by_state
Example #7
def test_job_status_include_replicaset_non_verbose(mock_get_kubernetes_app_by_name):
    kstatus = {}
    a_sync.block(
        pik.job_status,
        kstatus=kstatus,
        client=mock.Mock(),
        job_config=mock.Mock(),
        pod_list=[],
        replicaset_list=[mock.Mock(), mock.Mock(), mock.Mock()],
        verbose=0,
        namespace=mock.Mock(),
    )

    assert len(kstatus["replicasets"]) == 3
Example #8
def filter_tasks_in_smartstack(
    tasks: Collection[MarathonTask],
    service: str,
    nerve_ns: str,
    system_paasta_config: SystemPaastaConfig,
    max_hosts_to_query: int = 20,
    haproxy_min_fraction_up: float = 1.0,
) -> List[MarathonTask]:
    all_hosts = list({t.host for t in tasks})
    random.shuffle(all_hosts)
    # We select a random 20 hosts here. This should be enough most of the time: for services discovered at the habitat
    # level, in clusters with 2 habitats, there's about a 2 * (1/2) ** 20 ~= 2-per-million chance of not picking at
    # least one host in each habitat. For clusters with 3 habitats, the odds are about 3 * (2/3) ** 20 ~= 1-in-1000.
    # The only real effect would be that the bounce would decide to kill fewer old tasks, causing us to take another
    # round. If this becomes a problem, we can try to select tasks more intelligently.

    selected_hosts = all_hosts[:max_hosts_to_query]
    registered_task_count: typing.Counter[MarathonTask] = Counter()

    async def get_registered_tasks_on_host(host):
        try:
            registered_task_count.update(
                set(
                    await a_sync.to_async(get_registered_marathon_tasks)(
                        synapse_host=host,
                        synapse_port=system_paasta_config.get_synapse_port(),
                        synapse_haproxy_url_format=system_paasta_config.get_synapse_haproxy_url_format(),
                        service=compose_job_id(service, nerve_ns),
                        marathon_tasks=tasks,
                    )
                )
            )
        except (ConnectionError, RequestException):
            log.warning(
                f"Failed to connect to smartstack on {host}; this may cause us to consider tasks unhealthy."
            )

    if selected_hosts:
        a_sync.block(
            asyncio.wait,
            [
                asyncio.ensure_future(get_registered_tasks_on_host(host))
                for host in selected_hosts
            ],
            timeout=30,
        )

    threshold = len(selected_hosts) * haproxy_min_fraction_up
    return [t for t in tasks if registered_task_count[t] >= threshold]
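
The sampling odds quoted in the comment above, and the registration threshold on the last line, can be checked directly:

max_hosts_to_query = 20
print(2 * (1 / 2) ** max_hosts_to_query)  # ~1.9e-06, about 2-per-million (2 habitats)
print(3 * (2 / 3) ** max_hosts_to_query)  # ~9.0e-04, roughly 1-in-1000 (3 habitats)

# With haproxy_min_fraction_up == 1.0, a task survives the filter only if every
# queried host reported it as registered.
selected_hosts = ["host1", "host2", "host3"]
haproxy_min_fraction_up = 1.0
print(len(selected_hosts) * haproxy_min_fraction_up)  # 3.0 registrations required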
Example #9
def main(hostnames: Sequence[str]) -> None:
    master = get_mesos_master()
    try:
        mesos_state = block(master.state)
    except MasterNotAvailableException as e:
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)
    slaves = [
        slave for slave in mesos_state.get("slaves", [])
        if slave["hostname"] in hostnames
    ]
    tasks = get_all_tasks_from_state(mesos_state, include_orphans=True)
    filtered_tasks = filter_tasks_for_slaves(slaves, tasks)
    resource_info_dict = calculate_resource_utilization_for_slaves(
        slaves, filtered_tasks)
    resource_utilizations = resource_utillizations_from_resource_info(
        total=resource_info_dict["total"], free=resource_info_dict["free"])
    output = {}
    for metric in resource_utilizations:
        utilization = metric.total - metric.free
        if int(metric.total) == 0:
            utilization_perc = 100
        else:
            utilization_perc = utilization / float(metric.total) * 100
        output[metric.metric] = {
            "total": metric.total,
            "used": utilization,
            "perc": utilization_perc,
        }
    print(json.dumps(output))
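
The loop above guards the percentage calculation so a metric with zero capacity reports 100% used instead of raising ZeroDivisionError; the same guard in isolation:

total, free = 0, 0
utilization = total - free
utilization_perc = 100 if int(total) == 0 else utilization / float(total) * 100
print(utilization_perc)  # 100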
Example #10
def status_chronos_jobs(client, jobs, job_config, verbose):
    """Returns a formatted string of the status of a list of chronos jobs

    :param jobs: list of dicts of chronos job info as returned by the chronos
        client
    :param job_config: dict containing configuration about these jobs as
        provided by chronos_tools.load_chronos_job_config().
    :param verbose: int verbosity level
    """
    if jobs == []:
        return "%s: chronos job is not set up yet" % PaastaColors.yellow(
            "Warning")
    else:
        output = []
        desired_state = job_config.get_desired_state_human()
        output.append("Desired:    %s" % desired_state)
        for job in jobs:
            running_task_count = len(
                select_tasks_by_id(
                    a_sync.block(
                        get_cached_list_of_running_tasks_from_frameworks),
                    job["name"],
                ))
            output.append(
                format_chronos_job_status(client, job, running_task_count,
                                          verbose))
        return "\n".join(output)
Example #11
def autoscale_service_configs(
    service_configs: Sequence[MarathonServiceConfig],
    system_paasta_config: SystemPaastaConfig,
) -> None:
    if autoscaling_is_paused():
        log.warning("Skipping autoscaling because autoscaler paused")
        return

    marathon_clients = get_marathon_clients(
        get_marathon_servers(system_paasta_config))
    apps_with_clients = get_marathon_apps_with_clients(
        marathon_clients.get_all_clients(), embed_tasks=True)
    all_mesos_tasks = a_sync.block(get_all_running_tasks)
    with ZookeeperPool():
        for config in service_configs:
            try:
                marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                    [app for (app, client) in apps_with_clients],
                    all_mesos_tasks,
                    config,
                    system_paasta_config,
                )
                autoscale_marathon_instance(
                    config,
                    system_paasta_config,
                    list(marathon_tasks.values()),
                    mesos_tasks,
                )
            except Exception as e:
                write_to_log(config=config,
                             line="Caught Exception %s" % e,
                             level="debug")
Example #12
def unreserve_all_resources(hostnames):
    """Dynamically unreserve all available resources on the specified hosts
    :param hostnames: list of hostnames to unreserve resources on
    """
    mesos_state = a_sync.block(get_mesos_master().state_summary)
    components = hostnames_to_components(hostnames)
    hosts = components_to_hosts(components)
    known_slaves = [
        slave for slave in mesos_state["slaves"] if slave["hostname"] in hosts
    ]
    for slave in known_slaves:
        hostname = slave["hostname"]
        log.info("Unreserving all resources on %s" % hostname)
        slave_id = slave["id"]
        resources = []
        if MAINTENANCE_ROLE in slave["reserved_resources"]:
            for resource in ["disk", "mem", "cpus", "gpus"]:
                reserved_resource = slave["reserved_resources"][
                    MAINTENANCE_ROLE][resource]
                resources.append(
                    Resource(name=resource, amount=reserved_resource))
            try:
                unreserve(slave_id=slave_id, resources=resources)
            except HTTPError:
                raise HTTPError(
                    f"Failed unreserving all of the resources on {hostname} ({slave_id}). Aborting."
                )
Example #13
def _run_mesos_checks(mesos_master: MesosMaster,
                      mesos_state: MesosState) -> Sequence[HealthCheckResult]:
    mesos_state_status = metastatus_lib.get_mesos_state_status(mesos_state)

    metrics = a_sync.block(mesos_master.metrics_snapshot)
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics, mesos_state=mesos_state)
    return mesos_state_status + mesos_metrics_status  # type: ignore
Example #14
def test_get_files_for_tasks_all():
    mock_task = asynctest.MagicMock(spec=task.Task)
    mock_file = Mock()
    mock_file.exists = asynctest.CoroutineMock(return_value=True)
    mock_task.file.return_value = mock_file
    files = cluster.get_files_for_tasks([mock_task], ["myfile"], 1)
    files = a_sync.block(aiter_to_list, files)
    assert files == [mock_file]
Example #15
def chronos_instance_status(instance_status, service, instance, verbose):
    cstatus = {}
    chronos_config = chronos_tools.load_chronos_config()
    client = chronos_tools.get_chronos_client(chronos_config)
    job_config = chronos_tools.load_chronos_job_config(
        service=service,
        instance=instance,
        cluster=settings.cluster,
        soa_dir=settings.soa_dir,
    )
    cstatus['desired_state'] = job_config.get_desired_state()
    job_type = chronos_tools.get_job_type(job_config.config_dict)
    if job_type == chronos_tools.JobType.Scheduled:
        schedule_type = 'schedule'
        schedule = job_config.get_schedule()
        epsilon = job_config.get_epsilon()
        time_zone = job_config.get_schedule_time_zone()
        if time_zone == 'null' or time_zone is None:
            time_zone = 'UTC'
        cstatus['schedule'] = {}
        cstatus['schedule']['schedule'] = schedule
        cstatus['schedule']['epsilon'] = epsilon
        cstatus['schedule']['time_zone'] = time_zone
    elif job_type == chronos_tools.JobType.Dependent:
        schedule_type = 'parents'
        parents = job_config.get_parents()
        cstatus['parents'] = parents
    else:
        schedule_type = 'unknown'
    cstatus['schedule_type'] = schedule_type
    cstatus['status'] = {}
    if verbose:
        running_task_count = len(
            select_tasks_by_id(
                a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                job_config.get_job_name(),
            ),
        )
        cstatus['status']['mesos_state'] = 'running' if running_task_count else 'not_running'
    cstatus['status']['disabled_state'] = 'not_scheduled' if job_config.get_disabled() else 'scheduled'
    cstatus['status']['chronos_state'] = chronos_tools.get_chronos_status_for_job(client, service, instance)
    cstatus['command'] = job_config.get_cmd()
    last_time, last_status = chronos_tools.get_status_last_run(job_config.config_dict)
    if last_status == chronos_tools.LastRunState.Success:
        last_status = 'success'
    elif last_status == chronos_tools.LastRunState.Fail:
        last_status = 'fail'
    elif last_status == chronos_tools.LastRunState.NotRun:
        last_status = 'not_run'
    else:
        last_status = ''
    if last_status == 'not_run' or last_status == '':
        last_time = 'never'
    cstatus['last_status'] = {}
    cstatus['last_status']['result'] = last_status
    cstatus['last_status']['time'] = last_time

    return cstatus
Example #16
def _run_mesos_checks(mesos_master, mesos_state):
    mesos_state_status = metastatus_lib.get_mesos_state_status(mesos_state)

    metrics = a_sync.block(mesos_master.metrics_snapshot)
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    return mesos_state_status + mesos_metrics_status
Example #17
def get_mesos_tasks_and_slaves(
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Sequence[MarathonTask], List[Any]]:
    clients = get_marathon_clients(get_marathon_servers(system_paasta_config))
    all_clients: Sequence[MarathonClient] = clients.get_all_clients()
    all_tasks: List[MarathonTask] = []
    for client in all_clients:
        all_tasks.extend(client.list_tasks())
    mesos_slaves = a_sync.block(get_slaves)

    return all_tasks, mesos_slaves
Example #18
def test_get_files_for_tasks_some():
    mock_task = asynctest.MagicMock(spec=task.Task)
    mock_file = Mock()
    mock_file_2 = Mock()
    mock_file.exists = asynctest.CoroutineMock(return_value=False)
    mock_file_2.exists = asynctest.CoroutineMock(return_value=True)
    mock_task.file.side_effect = [mock_file, mock_file_2]
    files = cluster.get_files_for_tasks([mock_task], ["myfile", "myotherfile"],
                                        1)
    files = a_sync.block(aiter_to_list, files)
    assert files == [mock_file_2]
Example #19
def assert_quorum_size():
    masters, quorum = get_num_masters(), a_sync.block(get_mesos_quorum)
    if quorum_ok(masters, quorum):
        return HealthCheckResult(
            message="Quorum: masters: %d configured quorum: %d " % (masters, quorum),
            healthy=True,
        )
    else:
        return HealthCheckResult(
            message="CRITICAL: Number of masters (%d) less than configured quorum(%d)." % (masters, quorum),
            healthy=False,
        )
Example #20
def test_get_files_for_tasks_no_files():
    attrs = {"id": "foo"}
    mock_task = asynctest.MagicMock(spec=task.Task)
    mock_task.__getitem__.side_effect = lambda x: attrs[x]
    mock_file = Mock()
    mock_file.exists = asynctest.CoroutineMock(return_value=False)
    mock_task.file.return_value = mock_file
    files = cluster.get_files_for_tasks([mock_task], ["myfile"], 1)
    with raises(exceptions.FileNotFoundForTaskException) as excinfo:
        files = a_sync.block(aiter_to_list, files)
    assert "None of the tasks in foo contain the files in list myfile" in str(
        excinfo.value)
Example #21
def instance_tasks(request):
    status = instance_status(request)
    slave_hostname = request.swagger_data.get('slave_hostname', None)
    verbose = request.swagger_data.get('verbose', False)
    try:
        mstatus = status['marathon']
    except KeyError:
        raise ApiFailure("Only marathon tasks supported", 400)
    tasks = a_sync.block(get_tasks_from_app_id, mstatus['app_id'], slave_hostname=slave_hostname)
    if verbose:
        tasks = [add_executor_info(task) for task in tasks]
        tasks = [add_slave_info(task) for task in tasks]
    return [task._Task__items for task in tasks]
Example #22
def check_registration(threshold_percentage):
    try:
        mesos_state = block(get_mesos_master().state)
    except MasterNotAvailableException as e:
        print("Could not find Mesos Master: %s" % e.message)
        sys.exit(1)

    config = load_system_paasta_config()
    autoscaling_resources = config.get_cluster_autoscaling_resources()
    for resource in autoscaling_resources.values():
        print("Checking %s" % resource["id"])
        try:
            scaler = get_scaler(resource["type"])(
                resource=resource,
                pool_settings=None,
                config_folder=None,
                dry_run=True,
                utilization_error=0.0,
                max_increase=0.0,
                max_decrease=0.0,
            )
        except KeyError:
            print("Couldn't find a metric provider for resource of type: {}".
                  format(resource["type"]))
            continue
        if len(scaler.instances) == 0:
            print("No instances for this resource")
            continue
        elif scaler.is_new_autoscaling_resource():
            # See OPS-13784
            threshold = config.get_monitoring_config().get(
                "check_registered_slave_threshold")
            print(f"Autoscaling resource was created within last {threshold}"
                  " seconds and would probably fail this check")
            continue
        else:
            slaves = scaler.get_aws_slaves(mesos_state)
            percent_registered = (
                float(float(len(slaves)) / float(len(scaler.instances))) * 100)
            if percent_registered < float(threshold_percentage):
                print(
                    "CRIT: Only found {}% of instances in {} registered in mesos. "
                    "Please check for puppet or AMI baking problems!".format(
                        percent_registered, resource["id"]))
                return False
    print(
        "OK: Found more than {}% of instances registered for all paasta resources in this "
        "superregion".format(threshold_percentage))
    return True
Example #23
def get_autoscaling_info(apps_with_clients, service_config):
    if (
        service_config.get_max_instances()
        and service_config.get_desired_state() == "start"
    ):
        all_mesos_tasks = a_sync.block(get_cached_list_of_running_tasks_from_frameworks)
        autoscaling_params = service_config.get_autoscaling_params()
        autoscaling_params.update({"noop": True})
        system_paasta_config = load_system_paasta_config()
        try:
            marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                [app for (app, client) in apps_with_clients],
                all_mesos_tasks,
                service_config,
                system_paasta_config,
            )
            utilization = get_utilization(
                marathon_service_config=service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data={},
                marathon_tasks=list(marathon_tasks.values()),
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=service_config.get_instances(),
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=service_config.get_instances(),
                marathon_service_config=service_config,
                num_healthy_instances=len(marathon_tasks),
                persist_data=False,
            )
        except MetricsProviderNoDataError:
            utilization = None
            new_instance_count = None
        return ServiceAutoscalingInfo(
            current_instances=service_config.get_instances(),
            max_instances=service_config.get_max_instances(),
            min_instances=service_config.get_min_instances(),
            current_utilization=utilization,
            target_instances=new_instance_count,
        )
    return None
Example #24
def _clean_up_paasta_native_frameworks(context):
    clear_mesos_tools_cache()
    # context.etc_paasta signals that we actually have configured the mesos-cli.json; without this, we don't know where
    # to connect to clean up paasta native frameworks.
    if hasattr(context, "etc_paasta"):
        for framework in a_sync.block(
                mesos_tools.get_mesos_master().frameworks, active_only=True):
            if framework.name.startswith(
                    "paasta_native ") or framework.name == getattr(
                        context, "framework_name", ""):
                paasta_print("cleaning up framework %s" % framework.name)
                try:
                    mesos_tools.terminate_framework(framework.id)
                except requests.exceptions.HTTPError as e:
                    paasta_print(
                        f"Got exception when terminating framework {framework.id}: {e}"
                    )
Example #25
def check_mesos_no_duplicate_frameworks() -> None:
    options = parse_args()
    check = options.check.split(",")
    master = get_mesos_master()
    try:
        state = block(master.state)
    except MasterNotAvailableException as e:
        print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_no_duplicate_frameworks(state, check)
    if result.healthy:
        print("OK: " + result.message)
        sys.exit(0)
    else:
        print(result.message)
        sys.exit(2)
Example #26
def check_mesos_active_frameworks() -> None:
    options = parse_args()
    expected = options.expected.split(',')
    master = get_mesos_master()
    try:
        state = block(master.state)
    except MasterNotAvailableException as e:
        paasta_print("CRITICAL: %s" % e.args[0])
        sys.exit(2)

    result = assert_frameworks_exist(state, expected)
    if result.healthy:
        paasta_print("OK: " + result.message)
        sys.exit(0)
    else:
        paasta_print(result.message)
        sys.exit(2)
Example #27
def instance_task(request):
    status = instance_status(request)
    task_id = request.swagger_data.get('task_id', None)
    verbose = request.swagger_data.get('verbose', False)
    try:
        mstatus = status['marathon']
    except KeyError:
        raise ApiFailure("Only marathon tasks supported", 400)
    try:
        task = a_sync.block(get_task, task_id, app_id=mstatus['app_id'])
    except TaskNotFound:
        raise ApiFailure(f"Task with id {task_id} not found", 404)
    except Exception:
        error_message = traceback.format_exc()
        raise ApiFailure(error_message, 500)
    if verbose:
        task = add_slave_info(task)
        task = add_executor_info(task)
    return task._Task__items
Example #28
def status_mesos_tasks(
    service: str,
    instance: str,
    normal_instance_count: int,
    verbose: int,
) -> str:
    job_id = marathon_tools.format_job_id(service, instance)
    # We have to add a spacer at the end to make sure we only return
    # things for service.main and not service.main_foo
    filter_string = f"{job_id}{marathon_tools.MESOS_TASK_SPACER}"

    try:
        count = len(
            select_tasks_by_id(
                a_sync.block(get_cached_list_of_running_tasks_from_frameworks),
                filter_string))
        if count >= normal_instance_count:
            status = PaastaColors.green("Healthy")
            count_str = PaastaColors.green("(%d/%d)" %
                                           (count, normal_instance_count))
        elif count == 0:
            status = PaastaColors.red("Critical")
            count_str = PaastaColors.red("(%d/%d)" %
                                         (count, normal_instance_count))
        else:
            status = PaastaColors.yellow("Warning")
            count_str = PaastaColors.yellow("(%d/%d)" %
                                            (count, normal_instance_count))
        running_string = PaastaColors.bold('TASK_RUNNING')
        output = f"Mesos:      {status} - {count_str} tasks in the {running_string} state."
    except ReadTimeout:
        return "Error: talking to Mesos timed out. It may be overloaded."

    if verbose > 0:
        tail_lines = calculate_tail_lines(verbose_level=verbose)
        output += '\n' + status_mesos_tasks_verbose(
            filter_string=filter_string,
            get_short_task_id=get_short_task_id,
            tail_lines=tail_lines,
        )

    return output
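
The spacer comment in Example 28 is about prefix matching: the bare job id would also match sibling instances whose names merely extend it. A quick illustration, using '.' only as an assumed stand-in for marathon_tools.MESOS_TASK_SPACER and made-up task ids:

SPACER = '.'  # illustrative stand-in; the real value comes from marathon_tools.MESOS_TASK_SPACER
job_id = "service.main"
task_ids = ["service.main.gitabc123.config456", "service.main_foo.gitabc123.config789"]

print([t for t in task_ids if t.startswith(job_id)])           # matches both -- too broad
print([t for t in task_ids if t.startswith(job_id + SPACER)])  # matches only service.main tasks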
Example #29
def resources_utilization(request):
    master = get_mesos_master()
    mesos_state = block(master.state)

    groupings = request.swagger_data.get('groupings', ['superregion'])
    # swagger actually makes the key None if it's not set
    if groupings is None:
        groupings = ['superregion']
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    sorting_function = metastatus_lib.sort_func_for_attributes(groupings)

    filters = request.swagger_data.get('filter', [])
    filters = parse_filters(filters)
    filter_funcs = [
        metastatus_lib.make_filter_slave_func(attr, vals)
        for attr, vals in filters.items()
    ]

    resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=grouping_function,
        mesos_state=mesos_state,
        filters=filter_funcs,
        sort_func=sorting_function,
    )

    response_body = []
    for k, v in resource_info_dict.items():
        group = {'groupings': {}}
        for grouping, value in k:
            group['groupings'][grouping] = value
        for resource, value in v['total']._asdict().items():
            group[resource] = {'total': value}
        for resource, value in v['free']._asdict().items():
            group[resource]['free'] = value
        for resource in v['free']._fields:
            group[resource][
                'used'] = group[resource]['total'] - group[resource]['free']

        response_body.append(group)

    return Response(json_body=response_body, status_code=200)
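
Example 29 builds each group's usage numbers field by field from a pair of total/free namedtuples, deriving used as total minus free. The same three loops on hypothetical data:

from collections import namedtuple

ResourceTotals = namedtuple("ResourceTotals", ["cpus", "mem"])  # hypothetical fields
total = ResourceTotals(cpus=10.0, mem=4096.0)
free = ResourceTotals(cpus=4.0, mem=1024.0)

group = {}
for resource, value in total._asdict().items():
    group[resource] = {"total": value}
for resource, value in free._asdict().items():
    group[resource]["free"] = value
for resource in free._fields:
    group[resource]["used"] = group[resource]["total"] - group[resource]["free"]

print(group)  # {'cpus': {'total': 10.0, 'free': 4.0, 'used': 6.0}, 'mem': {'total': 4096.0, 'free': 1024.0, 'used': 3072.0}}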
Example #30
def main():
    args = parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()

    clients = marathon_tools.get_marathon_clients(
        marathon_tools.get_marathon_servers(system_paasta_config))
    all_clients = clients.get_all_clients()
    all_tasks = []
    for client in all_clients:
        all_tasks.extend(client.list_tasks())
    mesos_slaves = a_sync.block(get_slaves)
    smartstack_replication_checker = MesosSmartstackReplicationChecker(
        mesos_slaves, system_paasta_config)

    for service in list_services(soa_dir=args.soa_dir):
        service_config = PaastaServiceConfigLoader(service=service,
                                                   soa_dir=args.soa_dir)
        for instance_config in service_config.instance_configs(
                cluster=cluster,
                instance_type_class=marathon_tools.MarathonServiceConfig,
        ):
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_tasks=all_tasks,
                    smartstack_replication_checker=
                    smartstack_replication_checker,
                )
            else:
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.' %
                    instance_config.job_id, )