Beispiel #1
0
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_tasks_or_pods: Sequence[V1Pod],
    replication_checker: KubeSmartstackEnvoyReplicationChecker,
    dry_run: bool = False,
) -> None:
    """Check a Flink service instance's components and send one replication event.

    The pods belonging to ``instance_config``'s service/instance are selected
    from ``all_tasks_or_pods``. The healthy supervisor, jobmanager and
    taskmanager container counts are each checked against their expected
    counts, followed by a check of the registered taskmanagers. All results
    are then merged into a single event.

    :param instance_config: config of the Flink instance being checked; the
        expected taskmanager count is read from
        ``config_dict["taskmanager"]["instances"]`` and falls back to 10 when
        either key is missing.
    :param all_tasks_or_pods: pods to search for this service instance.
    :param replication_checker: accepted for interface compatibility; not
        used in this function body.
    :param dry_run: forwarded unchanged to ``send_replication_event``.
    """
    pods = filter_pods_by_service_instance(
        pod_list=all_tasks_or_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    taskmanager_config = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_config.get("instances", 10)

    # (component, expected count, healthy count); the supervisor and the
    # jobmanager are each expected exactly once.
    component_health = [
        (component, expected, healthy_flink_containers_cnt(pods, component))
        for component, expected in (
            ("supervisor", 1),
            ("jobmanager", 1),
            ("taskmanager", expected_taskmanagers),
        )
    ]

    cr_name = get_cr_name(pods)

    results = [
        check_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy,
            sub_component=component,
        )
        for component, expected, healthy in component_health
    ]
    results.append(
        check_under_registered_taskmanagers(
            instance_config=instance_config,
            expected_count=expected_taskmanagers,
            cr_name=cr_name,
        )
    )

    # Each result is indexed as [0] failure flag, [1] short output,
    # [2] long description.
    output = ", ".join(result[1] for result in results)
    description = "\n########\n".join(result[2] for result in results)
    has_failure = any(result[0] for result in results)

    if not has_failure:
        log.info(output)
        status = pysensu_yelp.Status.OK
    else:
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL

    send_replication_event(
        instance_config=instance_config,
        status=status,
        output=output,
        description=description,
        dry_run=dry_run,
    )
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Check a Flink service instance's components and send monitoring events.

    The pods belonging to ``instance_config``'s service/instance are selected
    from ``all_pods`` and the healthy supervisor, jobmanager and taskmanager
    containers are counted. The jobmanager overview is then queried for the
    number of taskmanagers it reports, and an event is sent for that count.
    Finally one under-replication event call is made per component.

    :param instance_config: config of the Flink instance being checked; the
        expected taskmanager count is read from
        ``config_dict["taskmanager"]["instances"]`` and falls back to 10 when
        either key is missing.
    :param all_pods: pods to search for this service instance.
    :param smartstack_replication_checker: accepted for interface
        compatibility; not used in this function body.
    """
    pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    taskmanager_config = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_config.get("instances", 10)

    # (component, expected count, healthy count); the supervisor and the
    # jobmanager are each expected exactly once.
    component_health = [
        (component, expected, healthy_flink_containers_cnt(pods, component))
        for component, expected in (
            ("supervisor", 1),
            ("jobmanager", 1),
            ("taskmanager", expected_taskmanagers),
        )
    ]

    # A ValueError from the overview lookup is not fatal: its message is
    # handed to the event sender and the reported count stays None.
    error_message = None
    registered_taskmanagers = None
    try:
        registered_taskmanagers = flink_tools.get_flink_jobmanager_overview(
            instance_config.service,
            instance_config.instance,
            instance_config.cluster,
        ).get("taskmanagers", 0)
    except ValueError as err:
        error_message = str(err)

    send_event_if_not_enough_taskmanagers(
        instance_config=instance_config,
        expected_count=expected_taskmanagers,
        num_reported=registered_taskmanagers,
        strerror=error_message,
    )

    for component, expected, healthy in component_health:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy,
            sub_component=component,
        )
Beispiel #3
0
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: KubernetesDeploymentConfig,
    expected_count: int,
    all_pods: Sequence[V1Pod],
) -> None:
    """Count the ready pods of a service instance and send a replication event.

    :param instance_config: config of the instance being checked; its
        ``service`` and ``instance`` select the relevant pods.
    :param expected_count: number of pods expected for the instance.
    :param all_pods: pods to search for this service instance.
    """
    matching_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    # Only pods for which is_pod_ready() is truthy count as available.
    ready_count = sum(1 for pod in matching_pods if is_pod_ready(pod))

    log.info(
        f"Checking {instance_config.service}.{instance_config.instance} in kubernetes as it is not in smartstack"
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=ready_count,
    )
Beispiel #4
0
def check_flink_service_replication(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Make one under-replication event call per Flink component.

    The pods belonging to ``instance_config``'s service/instance are selected
    from ``all_pods``. The healthy supervisor, jobmanager and taskmanager
    containers are counted, and each count is passed with its expected value
    to ``monitoring_tools.send_replication_event_if_under_replication``.

    :param instance_config: config of the Flink instance being checked; the
        expected taskmanager count is read from
        ``config_dict["taskmanager"]["instances"]`` and falls back to 10 when
        either key is missing.
    :param all_pods: pods to search for this service instance.
    :param smartstack_replication_checker: accepted for interface
        compatibility; not used in this function body.
    """
    pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    taskmanager_config = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    expected_taskmanagers = taskmanager_config.get("instances", 10)

    # (component, expected count, healthy count); the supervisor and the
    # jobmanager are each expected exactly once.
    component_health = [
        (component, expected, healthy_flink_containers_cnt(pods, component))
        for component, expected in (
            ("supervisor", 1),
            ("jobmanager", 1),
            ("taskmanager", expected_taskmanagers),
        )
    ]

    # TBD: check cnt according to Flink

    for component, expected, healthy in component_health:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=expected,
            num_available=healthy,
            sub_component=component,
        )