Code Example #1
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
                            # are healthy, and tasks with a defined healthcheck but no healthcheck
                            # results are unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
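The health logic in the comment above hinges on is_task_healthy from paasta's marathon_tools. The following is a minimal sketch of the semantics these examples rely on, written as an assumption for illustration rather than the project's actual implementation:

# Hypothetical sketch of is_task_healthy; the real helper lives in
# paasta_tools.marathon_tools and may differ in detail.
def is_task_healthy(task, require_all=True, default_healthy=False):
    """Judge a Marathon task by its healthcheck results.

    With no healthcheck results at all, fall back to default_healthy.
    """
    results = task.health_check_results or []
    if not results:
        return default_healthy
    alive = [result.alive for result in results]
    return all(alive) if require_all else any(alive)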
Code Example #2
File: autoscaling_lib.py Project: fnzv/paasta
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks(
                    '')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service,
                                                   config.instance)
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
                            # are healthy, and tasks with a defined healthcheck but no healthcheck
                            # results are unhealthy.
                            marathon_tasks = {
                                task.id: task
                                for task in all_marathon_tasks
                                if job_id == get_short_job_id(task.id) and
                                (is_task_healthy(task) or
                                 not marathon_client.get_app(task.app_id).health_checks)
                            }
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError(
                                    "Couldn't find any healthy marathon tasks")
                            mesos_tasks = [
                                task for task in all_mesos_tasks
                                if task['id'] in marathon_tasks
                            ]
                            autoscale_marathon_instance(
                                config, list(marathon_tasks.values()),
                                mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config,
                                         line='Caught Exception %s' % e)
    except LockHeldException:
        pass
Code Example #3
def filter_autoscaling_tasks(marathon_apps, all_mesos_tasks, config):
    job_id_prefix = "%s%s" % (format_job_id(
        service=config.service, instance=config.instance), MESOS_TASK_SPACER)

    # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
    # are healthy, and tasks with a defined healthcheck but no healthcheck
    # results are unhealthy (unless they are "old", in which case we assume
    # that Marathon has screwed up and stopped healthchecking but that they
    # are healthy).
    log.info("Inspecting %s for autoscaling" % job_id_prefix)
    marathon_tasks = {}
    for app in marathon_apps:
        for task in app.tasks:
            if task.id.startswith(job_id_prefix) and (
                    is_task_healthy(task) or not app.health_checks
                    or is_old_task_missing_healthchecks(task, app)):
                marathon_tasks[task.id] = task

    if not marathon_tasks:
        raise MetricsProviderNoDataError(
            "Couldn't find any healthy marathon tasks")
    mesos_tasks = [
        task for task in all_mesos_tasks if task['id'] in marathon_tasks
    ]
    return (marathon_tasks, mesos_tasks)
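The "old" task escape hatch in the comment above is implemented by is_old_task_missing_healthchecks. A hedged sketch of that heuristic follows; the grace-period arithmetic is an assumption for illustration, not the project's exact implementation:

import datetime

# Hypothetical sketch of is_old_task_missing_healthchecks; the real helper
# lives in paasta_tools.marathon_tools and may differ in detail.
def is_old_task_missing_healthchecks(task, app):
    """Treat a long-running task that has healthchecks defined but no results
    as healthy, assuming Marathon lost track of it (e.g. after a leader
    election)."""
    if not app.health_checks or task.health_check_results or task.started_at is None:
        return False
    first_check = app.health_checks[0]
    startup_allowance = datetime.timedelta(
        seconds=first_check.grace_period_seconds + 5 * first_check.interval_seconds,
    )
    return datetime.datetime.now(task.started_at.tzinfo) > task.started_at + startup_allowance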
Code Example #4
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster, soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
                            # are healthy, and tasks with a defined healthcheck but no healthcheck
                            # results are unhealthy.
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning("Skipping autoscaling run for services because the lock is held")
Code Example #5
def get_happy_tasks(
    app: MarathonApp,
    service: str,
    nerve_ns: str,
    system_paasta_config: SystemPaastaConfig,
    min_task_uptime: Optional[float] = None,
    check_haproxy: bool = False,
    haproxy_min_fraction_up: float = 1.0,
) -> List[MarathonTask]:
    """Given a MarathonApp object, return the subset of tasks which are considered healthy.
    With the default options, this returns tasks where at least one of the defined Marathon healthchecks passes.
    For it to do anything interesting, set min_task_uptime or check_haproxy.

    :param app: A MarathonApp object.
    :param service: The name of the service.
    :param nerve_ns: The nerve namespace
    :param min_task_uptime: Minimum number of seconds that a task must be running before we consider it healthy. Useful
                            if tasks take a while to start up.
    :param check_haproxy: Whether to check the local haproxy to make sure this task has been registered and discovered.
    """
    tasks = app.tasks
    happy = []
    now = datetime.datetime.now(datetime.timezone.utc)

    for task in tasks:
        if task.started_at is None:
            # Can't be healthy if it hasn't started
            continue

        if min_task_uptime is not None:
            if (now - task.started_at).total_seconds() < min_task_uptime:
                continue

        # if there are healthchecks defined for the app but none have executed yet, then task is unhappy
        # BUT if the task is "old" and Marathon forgot about its healthcheck due to a leader election,
        # treat it as happy
        if (len(app.health_checks) > 0 and len(task.health_check_results) == 0
                and not marathon_tools.is_old_task_missing_healthchecks(
                    task, app)):
            continue

        # if there are health check results, check if at least one healthcheck is passing
        if not marathon_tools.is_task_healthy(
                task, require_all=False, default_healthy=True):
            continue

        happy.append(task)

    if check_haproxy:
        return filter_tasks_in_smartstack(
            happy,
            service,
            nerve_ns,
            system_paasta_config,
            haproxy_min_fraction_up=haproxy_min_fraction_up,
        )
    else:
        return happy
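A hedged usage sketch of the annotated get_happy_tasks above; the wrapper name, the service name and the nerve namespace are illustrative assumptions, as is the import path for load_system_paasta_config:

from paasta_tools.utils import load_system_paasta_config  # assumed import path


def count_happy_tasks(marathon_client, app_id):
    # Hypothetical helper for illustration only.
    system_paasta_config = load_system_paasta_config()
    app = marathon_client.get_app(app_id)
    happy = get_happy_tasks(
        app,
        service='example_service',   # assumed service name
        nerve_ns='main',             # assumed nerve namespace
        system_paasta_config=system_paasta_config,
        min_task_uptime=120,         # ignore tasks younger than two minutes
        check_haproxy=True,          # also require registration in haproxy/smartstack
    )
    return len(happy), len(app.tasks)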
Code Example #6
File: bounce_lib.py Project: Yelp/paasta
def get_happy_tasks(app, service, nerve_ns, system_paasta_config, min_task_uptime=None, check_haproxy=False):
    """Given a MarathonApp object, return the subset of tasks which are considered healthy.
    With the default options, this returns tasks where at least one of the defined Marathon healthchecks passes.
    For it to do anything interesting, set min_task_uptime or check_haproxy.

    :param app: A MarathonApp object.
    :param service: The name of the service.
    :param nerve_ns: The nerve namespace
    :param min_task_uptime: Minimum number of seconds that a task must be running before we consider it healthy. Useful
                            if tasks take a while to start up.
    :param check_haproxy: Whether to check the local haproxy to make sure this task has been registered and discovered.
    """
    tasks = app.tasks
    happy = []
    now = datetime.datetime.utcnow()

    if check_haproxy:
        tasks_in_smartstack = []
        service_namespace = compose_job_id(service, nerve_ns)

        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=nerve_ns)
        discover_location_type = service_namespace_config.get_discover()
        unique_values = mesos_tools.get_mesos_slaves_grouped_by_attribute(
            slaves=mesos_tools.get_slaves(),
            attribute=discover_location_type
        )

        for value, hosts in unique_values.iteritems():
            synapse_hostname = hosts[0]['hostname']
            tasks_in_smartstack.extend(get_registered_marathon_tasks(
                synapse_hostname,
                system_paasta_config.get_synapse_port(),
                system_paasta_config.get_synapse_haproxy_url_format(),
                service_namespace,
                tasks,
            ))
        tasks = tasks_in_smartstack

    for task in tasks:
        if task.started_at is None:
            # Can't be healthy if it hasn't started
            continue

        if min_task_uptime is not None:
            if (now - task.started_at).total_seconds() < min_task_uptime:
                continue

        # if there are healthchecks defined for the app but none have executed yet, then task is unhappy
        if len(app.health_checks) > 0 and len(task.health_check_results) == 0:
            continue

        # if there are health check results, check if at least one healthcheck is passing
        if not marathon_tools.is_task_healthy(task, require_all=False, default_healthy=True):
            continue
        happy.append(task)

    return happy
Code Example #7
def get_verbose_status_of_marathon_app(marathon_client, app, service, instance,
                                       cluster, soa_dir):
    """Takes a given marathon app object and returns the verbose details
    about the tasks, times, hosts, etc"""
    output = []
    create_datetime = datetime_from_utc_to_local(
        isodate.parse_datetime(app.version))
    output.append("  Marathon app ID: %s" % PaastaColors.bold(app.id))
    output.append(
        "    App created: %s (%s)" %
        (str(create_datetime), humanize.naturaltime(create_datetime)))

    autoscaling_info = get_autoscaling_info(marathon_client, service, instance,
                                            cluster, soa_dir)
    if autoscaling_info:
        output.append("    Autoscaling Info:")
        headers = [
            field.replace("_", " ").capitalize()
            for field in ServiceAutoscalingInfo._fields
        ]
        table = [headers, autoscaling_info]
        output.append('\n'.join(
            ["      %s" % line for line in format_table(table)]))

    output.append("    Tasks:")
    rows = [("Mesos Task ID", "Host deployed to", "Deployed at what localtime",
             "Health")]
    for task in app.tasks:
        local_deployed_datetime = datetime_from_utc_to_local(task.staged_at)
        if task.host is not None:
            hostname = "%s:%s" % (task.host.split(".")[0], task.ports[0])
        else:
            hostname = "Unknown"
        if not task.health_check_results:
            health_check_status = PaastaColors.grey("N/A")
        elif marathon_tools.is_task_healthy(task):
            health_check_status = PaastaColors.green("Healthy")
        else:
            health_check_status = PaastaColors.red("Unhealthy")

        rows.append((
            get_short_task_id(task.id),
            hostname,
            '%s (%s)' % (
                local_deployed_datetime.strftime("%Y-%m-%dT%H:%M"),
                humanize.naturaltime(local_deployed_datetime),
            ),
            health_check_status,
        ))
    output.append('\n'.join(["      %s" % line
                             for line in format_table(rows)]))
    if len(app.tasks) == 0:
        output.append("      No tasks associated with this marathon app")
    return app.tasks, "\n".join(output)
Code Example #8
def filter_healthy_marathon_instances_for_short_app_id(all_tasks, app_id):
    tasks_for_app = [task for task in all_tasks if task.app_id.startswith('/%s' % app_id)]
    one_minute_ago = datetime.now() - timedelta(minutes=1)

    healthy_tasks = []
    for task in tasks_for_app:
        if marathon_tools.is_task_healthy(task, default_healthy=True) \
                and task.started_at is not None \
                and datetime_from_utc_to_local(task.started_at) < one_minute_ago:
            healthy_tasks.append(task)
    return len(healthy_tasks)
Code Example #9
def build_marathon_task_dict(marathon_task: MarathonTask) -> MutableMapping[str, Any]:
    task_dict = {
        "id": get_short_task_id(marathon_task.id),
        "host": marathon_task.host.split(".")[0],
        "port": marathon_task.ports[0],
        "deployed_timestamp": marathon_task.staged_at.timestamp(),
    }

    if marathon_task.health_check_results:
        task_dict["is_healthy"] = marathon_tools.is_task_healthy(marathon_task)

    return task_dict
Code Example #10
def get_healthy_marathon_instances_for_short_app_id(client, app_id):
    tasks = client.list_tasks()
    tasks_for_app = [task for task in tasks if task.app_id.startswith('/%s' % app_id)]

    one_minute_ago = datetime.now() - timedelta(minutes=1)

    healthy_tasks = []
    for task in tasks_for_app:
        if marathon_tools.is_task_healthy(task, default_healthy=True) \
                and task.started_at is not None \
                and datetime_from_utc_to_local(task.started_at) < one_minute_ago:
            healthy_tasks.append(task)
    return len(healthy_tasks)
Code Example #11
def filter_healthy_marathon_instances_for_short_app_id(all_tasks, app_id):
    tasks_for_app = [
        task for task in all_tasks if task.app_id.startswith("/%s" % app_id)
    ]
    one_minute_ago = datetime.now(timezone.utc) - timedelta(minutes=1)

    healthy_tasks = []
    for task in tasks_for_app:
        if (marathon_tools.is_task_healthy(task, default_healthy=True)
                and task.started_at is not None
                and task.started_at < one_minute_ago):
            healthy_tasks.append(task)
    return len(healthy_tasks)
Code Example #12
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            configs = get_configs_of_services_to_scale(cluster=cluster,
                                                       soa_dir=soa_dir)
            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_all_running_tasks()
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = config.format_marathon_app_dict()['id']
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
                            # are healthy, and tasks with a defined healthcheck but no healthcheck
                            # results are unhealthy (unless they are "old", in which case we assume
                            # that Marathon has screwed up and stopped healthchecking but that they
                            # are healthy).
                            log.info("Inspecting %s for autoscaling" % job_id)
                            marathon_tasks = {
                                task.id: task
                                for task in all_marathon_tasks
                                if task.id.startswith(job_id) and
                                (is_task_healthy(task) or
                                 not marathon_client.get_app(task.app_id).health_checks or
                                 is_old_task_missing_healthchecks(task, marathon_client))
                            }
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError(
                                    "Couldn't find any healthy marathon tasks")
                            mesos_tasks = [
                                task for task in all_mesos_tasks
                                if task['id'] in marathon_tasks
                            ]
                            autoscale_marathon_instance(
                                config, list(marathon_tasks.values()),
                                mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config,
                                         line='Caught Exception %s' % e)
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for services because the lock is held")
Code Example #13
File: autoscaling_lib.py Project: EvanKrall/paasta
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
                            # are healthy, and tasks with a defined healthcheck but no healthcheck
                            # results are unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
Code Example #14
def get_happy_tasks(app,
                    service,
                    nerve_ns,
                    system_paasta_config,
                    min_task_uptime=None,
                    check_haproxy=False):
    """Given a MarathonApp object, return the subset of tasks which are considered healthy.
    With the default options, this returns tasks where at least one of the defined Marathon healthchecks passes.
    For it to do anything interesting, set min_task_uptime or check_haproxy.

    :param app: A MarathonApp object.
    :param service: The name of the service.
    :param nerve_ns: The nerve namespace
    :param min_task_uptime: Minimum number of seconds that a task must be running before we consider it healthy. Useful
                            if tasks take a while to start up.
    :param check_haproxy: Whether to check the local haproxy to make sure this task has been registered and discovered.
    """
    tasks = app.tasks
    happy = []
    now = datetime.datetime.utcnow()

    for task in tasks:
        if task.started_at is None:
            # Can't be healthy if it hasn't started
            continue

        if min_task_uptime is not None:
            if (now - task.started_at).total_seconds() < min_task_uptime:
                continue

        # if there are healthchecks defined for the app but none have executed yet, then task is unhappy
        if len(app.health_checks) > 0 and len(task.health_check_results) == 0:
            continue

        # if there are health check results, check if at least one healthcheck is passing
        if not marathon_tools.is_task_healthy(
                task, require_all=False, default_healthy=True):
            continue

        if check_haproxy:
            if not is_task_in_smartstack(task, service, nerve_ns,
                                         system_paasta_config):
                continue

        happy.append(task)
    return happy
Code Example #15
def filter_autoscaling_tasks(marathon_client, all_marathon_tasks, all_mesos_tasks, config):
    job_id = config.format_marathon_app_dict()['id']
    # Get a dict of healthy tasks; we assume tasks with no healthcheck defined
    # are healthy, and tasks with a defined healthcheck but no healthcheck
    # results are unhealthy (unless they are "old", in which case we assume
    # that Marathon has screwed up and stopped healthchecking but that they
    # are healthy).
    log.info("Inspecting %s for autoscaling" % job_id)
    marathon_tasks = {task.id: task for task in all_marathon_tasks
                      if task.id.startswith(job_id) and
                      (is_task_healthy(task) or not
                       marathon_client.get_app(task.app_id).health_checks or
                       is_old_task_missing_healthchecks(task, marathon_client))}
    if not marathon_tasks:
        raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
    mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
    return (marathon_tasks, mesos_tasks)
Code Example #16
def get_verbose_status_of_marathon_app(app):
    """Takes a given marathon app object and returns the verbose details
    about the tasks, times, hosts, etc"""
    output = []
    create_datetime = datetime_from_utc_to_local(isodate.parse_datetime(app.version))
    output.append("  Marathon app ID: %s" % PaastaColors.bold(app.id))
    output.append("    App created: %s (%s)" % (str(create_datetime), humanize.naturaltime(create_datetime)))
    output.append("    Tasks:")

    rows = [("Mesos Task ID", "Host deployed to", "Deployed at what localtime", "Health")]
    for task in app.tasks:
        local_deployed_datetime = datetime_from_utc_to_local(task.staged_at)
        if task.host is not None:
            hostname = "%s:%s" % (task.host.split(".")[0], task.ports[0])
        else:
            hostname = "Unknown"
        if not task.health_check_results:
            health_check_status = PaastaColors.grey("N/A")
        elif marathon_tools.is_task_healthy(task):
            health_check_status = PaastaColors.green("Healthy")
        else:
            health_check_status = PaastaColors.red("Unhealthy")

        rows.append((
            get_short_task_id(task.id),
            hostname,
            '%s (%s)' % (
                local_deployed_datetime.strftime("%Y-%m-%dT%H:%M"),
                humanize.naturaltime(local_deployed_datetime),
            ),
            health_check_status,
        ))
    output.append('\n'.join(["      %s" % line for line in format_table(rows)]))
    if len(app.tasks) == 0:
        output.append("      No tasks associated with this marathon app")
    return app.tasks, "\n".join(output)
Code Example #17
def status_marathon_app(
    marathon_client: marathon_tools.MarathonClient,
    app: marathon_tools.MarathonApp,
    service: str,
    instance: str,
    cluster: str,
    soa_dir: str,
    dashboards: Dict[marathon_tools.MarathonClient, str],
    verbose: int,
) -> Tuple[int, int, str]:
    """Takes a given marathon app object and returns the details about start, times, hosts, etc"""
    output = []
    create_datetime = datetime_from_utc_to_local(isodate.parse_datetime(app.version))
    output.append(get_marathon_dashboard(marathon_client, dashboards, app.id))
    output.append(
        "    "
        + " ".join(
            [
                f"{app.tasks_running} running,",
                f"{app.tasks_healthy} healthy,",
                f"{app.tasks_staged} staged",
                f"out of {app.instances}",
            ]
        )
    )
    output.append(
        "    App created: {} ({})".format(
            str(create_datetime), humanize.naturaltime(create_datetime)
        )
    )

    deploy_status = marathon_tools.get_marathon_app_deploy_status(marathon_client, app)
    app_queue = marathon_tools.get_app_queue(marathon_client, app.id)
    unused_offers_summary = marathon_tools.summarize_unused_offers(app_queue)
    if deploy_status == marathon_tools.MarathonDeployStatus.Delayed:
        _, backoff_seconds = marathon_tools.get_app_queue_status_from_queue(app_queue)
        deploy_status_human = marathon_app_deploy_status_human(
            deploy_status, backoff_seconds
        )
    else:
        deploy_status_human = marathon_app_deploy_status_human(deploy_status)
    output.append(f"    Status: {deploy_status_human}")

    if unused_offers_summary is not None and len(unused_offers_summary) > 0:
        output.append("    Possibly stalled for:")
        output.append(
            "      ".join([f"{k}: {n} times" for k, n in unused_offers_summary.items()])
        )

    if verbose > 0:
        output.append("    Tasks:")
        rows = [
            (
                "Mesos Task ID",
                "Host deployed to",
                "Deployed at what localtime",
                "Health",
            )
        ]
        for task in app.tasks:
            local_deployed_datetime = datetime_from_utc_to_local(task.staged_at)
            if task.host is not None:
                hostname = "{}:{}".format(task.host.split(".")[0], task.ports[0])
            else:
                hostname = "Unknown"
            if not task.health_check_results:
                health_check_status = PaastaColors.grey("N/A")
            elif marathon_tools.is_task_healthy(task):
                health_check_status = PaastaColors.green("Healthy")
            else:
                health_check_status = PaastaColors.red("Unhealthy")

            rows.append(
                (
                    get_short_task_id(task.id),
                    hostname,
                    "{} ({})".format(
                        local_deployed_datetime.strftime("%Y-%m-%dT%H:%M"),
                        humanize.naturaltime(local_deployed_datetime),
                    ),
                    health_check_status,
                )
            )
        output.append("\n".join(["      %s" % line for line in format_table(rows)]))
        if len(app.tasks) == 0:
            output.append("      No tasks associated with this marathon app")
    return deploy_status, app.tasks_running, "\n".join(output)
Code Example #18
File: bounce_lib.py Project: edric-shen/paasta
def get_happy_tasks(app,
                    service,
                    nerve_ns,
                    system_paasta_config,
                    min_task_uptime=None,
                    check_haproxy=False):
    """Given a MarathonApp object, return the subset of tasks which are considered healthy.
    With the default options, this returns tasks where at least one of the defined Marathon healthchecks passes.
    For it to do anything interesting, set min_task_uptime or check_haproxy.

    :param app: A MarathonApp object.
    :param service: The name of the service.
    :param nerve_ns: The nerve namespace
    :param min_task_uptime: Minimum number of seconds that a task must be running before we consider it healthy. Useful
                            if tasks take a while to start up.
    :param check_haproxy: Whether to check the local haproxy to make sure this task has been registered and discovered.
    """
    tasks = app.tasks
    happy = []
    now = datetime.datetime.utcnow()

    if check_haproxy:
        tasks_in_smartstack = []
        service_namespace = compose_job_id(service, nerve_ns)

        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=nerve_ns)
        discover_location_type = service_namespace_config.get_discover()
        unique_values = mesos_tools.get_mesos_slaves_grouped_by_attribute(
            slaves=mesos_tools.get_slaves(), attribute=discover_location_type)

        for value, hosts in unique_values.iteritems():
            synapse_hostname = hosts[0]['hostname']
            tasks_in_smartstack.extend(
                get_registered_marathon_tasks(
                    synapse_hostname,
                    system_paasta_config.get_synapse_port(),
                    system_paasta_config.get_synapse_haproxy_url_format(),
                    service_namespace,
                    tasks,
                ))
        tasks = tasks_in_smartstack

    for task in tasks:
        if task.started_at is None:
            # Can't be healthy if it hasn't started
            continue

        if min_task_uptime is not None:
            if (now - task.started_at).total_seconds() < min_task_uptime:
                continue

        # if there are healthchecks defined for the app but none have executed yet, then task is unhappy
        if len(app.health_checks) > 0 and len(task.health_check_results) == 0:
            continue

        # if there are health check results, check if at least one healthcheck is passing
        if not marathon_tools.is_task_healthy(
                task, require_all=False, default_healthy=True):
            continue
        happy.append(task)

    return happy
Code Example #19
def filter_autoscaling_tasks(
    marathon_apps: Sequence[MarathonApp],
    all_mesos_tasks: Sequence[Task],
    config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
) -> Tuple[Mapping[str, MarathonTask], Sequence[Task]]:
    """Find the tasks that are serving traffic. We care about this because many tasks have a period of high CPU when
    they first start up, during which they warm up code, load and process data, etc., and we don't want this high load
    to drag our overall load estimate upwards. Allowing these tasks to count towards overall load could cause a cycle of
    scaling up, seeing high load due to new warming-up containers, scaling up, until we hit max_instances.

    However, accidentally omitting a task that actually is serving traffic will cause us to underestimate load; this is
    generally much worse than overestimating, since it can cause us to incorrectly scale down or refuse to scale up when
    necessary. For this reason, we look at several sources of health information, and if they disagree, assume the task
    is serving traffic.
    """
    job_id_prefix = "{}{}".format(
        format_job_id(service=config.service, instance=config.instance),
        MESOS_TASK_SPACER,
    )

    # Get a dict of healthy tasks; we assume tasks with no healthcheck defined are healthy.
    # We assume tasks with no healthcheck results but a defined healthcheck to be unhealthy, unless they are "old" in
    # which case we assume that Marathon has screwed up and stopped healthchecking but that they are healthy.

    log.info("Inspecting %s for autoscaling" % job_id_prefix)

    relevant_tasks_by_app: Dict[MarathonApp, List[MarathonTask]] = {
        app: app.tasks
        for app in marathon_apps
        if app.id.lstrip("/").startswith(job_id_prefix)
    }

    healthy_marathon_tasks: Dict[str, MarathonTask] = {}

    for app, tasks in relevant_tasks_by_app.items():
        for task in tasks:
            if (is_task_healthy(task) or not app.health_checks
                    or is_old_task_missing_healthchecks(task, app)):
                healthy_marathon_tasks[task.id] = task

    service_namespace_config = load_service_namespace_config(
        service=config.service, namespace=config.get_nerve_namespace())
    if service_namespace_config.is_in_smartstack():

        for task in filter_tasks_in_smartstack(
                tasks=[
                    task for tasks in relevant_tasks_by_app.values()
                    for task in tasks
                ],
                service=config.service,
                nerve_ns=config.get_nerve_namespace(),
                system_paasta_config=system_paasta_config,
                max_hosts_to_query=20,
                haproxy_min_fraction_up=0.01,  # Be very liberal. See docstring above for rationale.
        ):
            healthy_marathon_tasks[task.id] = task

    if not healthy_marathon_tasks:
        raise MetricsProviderNoDataError(
            "Couldn't find any healthy marathon tasks")
    mesos_tasks = [
        task for task in all_mesos_tasks
        if task["id"] in healthy_marathon_tasks
    ]
    return (healthy_marathon_tasks, mesos_tasks)
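Finally, a hedged sketch of how the filter above might be wired into a single autoscaling pass. Only names that already appear in the examples are reused; how marathon_apps, config and system_paasta_config are obtained is assumed:

def run_one_autoscaling_pass(marathon_apps, config, system_paasta_config):
    # Hypothetical wrapper for illustration; arguments are assumed to be
    # prepared as in the earlier examples (apps with tasks embedded, a
    # MarathonServiceConfig, and the system config).
    all_mesos_tasks = get_all_running_tasks()  # as in Code Example #12
    try:
        marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
            marathon_apps, all_mesos_tasks, config, system_paasta_config,
        )
        autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
    except MetricsProviderNoDataError as e:
        write_to_log(config=config, line='Caught Exception %s' % e)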