Code example #1
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    si_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    taskmanagers_expected_cnt = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    ).get("instances", 10)
    num_healthy_supervisors = healthy_flink_containers_cnt(
        si_pods, "supervisor")
    num_healthy_jobmanagers = healthy_flink_containers_cnt(
        si_pods, "jobmanager")
    num_healthy_taskmanagers = healthy_flink_containers_cnt(
        si_pods, "taskmanager")

    strerror = None
    reported_taskmanagers = None
    try:
        overview = flink_tools.get_flink_jobmanager_overview(
            instance_config.service, instance_config.instance,
            instance_config.cluster)
        reported_taskmanagers = overview.get("taskmanagers", 0)
    except ValueError as e:
        strerror = str(e)

    send_event_if_not_enough_taskmanagers(
        instance_config=instance_config,
        expected_count=taskmanagers_expected_cnt,
        num_reported=reported_taskmanagers,
        strerror=strerror,
    )

    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=1,
        num_available=num_healthy_supervisors,
        sub_component="supervisor",
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=1,
        num_available=num_healthy_jobmanagers,
        sub_component="jobmanager",
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=taskmanagers_expected_cnt,
        num_available=num_healthy_taskmanagers,
        sub_component="taskmanager",
    )
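
Example #1 depends on a helper, healthy_flink_containers_cnt, to count pods whose Flink component of a given type is healthy. The snippet below is a minimal sketch of such a helper, not the actual paasta_tools implementation: it assumes each pod carries a hypothetical flink.yelp.com/container-type label naming its component and treats the pod Ready condition as the health signal.

from typing import Sequence

from kubernetes.client import V1Pod


def is_pod_ready(pod: V1Pod) -> bool:
    # A pod counts as ready when its Ready condition reports "True".
    conditions = (pod.status.conditions or []) if pod.status else []
    return any(c.type == "Ready" and c.status == "True" for c in conditions)


def healthy_flink_containers_cnt(si_pods: Sequence[V1Pod], container_type: str) -> int:
    # Count ready pods of the requested component (supervisor/jobmanager/taskmanager).
    # The label key is illustrative; the real deployment may mark components differently.
    return len(
        [
            pod
            for pod in si_pods
            if (pod.metadata.labels or {}).get("flink.yelp.com/container-type") == container_type
            and is_pod_ready(pod)
        ]
    )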
Code example #2
def check_under_registered_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    cr_name: str,
) -> Tuple[bool, str, str]:
    """Check if not enough taskmanagers have been registered to the jobmanager and
    returns both the result of the check in the form of a boolean and a human-readable
    text to be used in logging or monitoring events.
    """
    unhealthy = True
    if cr_name != "":
        try:
            overview = flink_tools.get_flink_jobmanager_overview(
                cr_name, instance_config.cluster
            )
            num_reported = overview.get("taskmanagers", 0)
            crit_threshold = instance_config.get_replication_crit_percentage()
            output = (
                f"{instance_config.job_id} has {num_reported}/{expected_count} "
                f"taskmanagers reported by dashboard (threshold: {crit_threshold}%)"
            )
            unhealthy, _ = is_under_replicated(
                num_reported, expected_count, crit_threshold
            )
        except ValueError as e:
            output = (
                f"Dashboard of service {instance_config.job_id} is not available ({e})"
            )
    else:
        output = f"Dashboard of service {instance_config.job_id} is not available"
    if unhealthy:
        description = f"""
This alert means that the Flink dashboard is not reporting the expected
number of taskmanagers.

Reasons this might be happening:

  The service may simply be unhealthy. There also may not be enough resources
  in the cluster to support the requested instance count.

Things you can do:

  * Fix the cause of the unhealthy service. Try running:

     paasta status -s {instance_config.service} -i {instance_config.instance} -c {instance_config.cluster} -vv

"""
    else:
        description = f"{instance_config.job_id} taskmanager is available"
    return unhealthy, output, description
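
Example #2 only computes the result; emitting the event is left to its caller. The sketch below is illustrative glue, not the actual paasta_tools caller: it runs the check and logs the short output line plus the longer description when the check reports an unhealthy state.

import logging

log = logging.getLogger(__name__)


def report_under_registered_taskmanagers(
    instance_config: "FlinkDeploymentConfig",
    expected_count: int,
    cr_name: str,
) -> None:
    # Illustrative caller: evaluate the check and surface its result.
    unhealthy, output, description = check_under_registered_taskmanagers(
        instance_config=instance_config,
        expected_count=expected_count,
        cr_name=cr_name,
    )
    if unhealthy:
        # The real service would emit a monitoring/paging event here.
        log.error("%s\n%s", output, description)
    else:
        log.info(output)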
Code example #3
File: test_flink_tools.py  Project: zaitsevlab/paasta
def test_get_flink_jobmanager_overview():
    with mock.patch(
        "paasta_tools.flink_tools._dashboard_get",
        autospec=True,
        return_value='{"taskmanagers":10,"slots-total":10,"flink-version":"1.6.4","flink-commit":"6241481"}',
    ) as mock_dashboard_get:
        cluster = "mycluster"
        cr_name = "kurupt--fm-7c7b459d59"
        overview = flink_tools.get_flink_jobmanager_overview(cr_name, cluster)
        mock_dashboard_get.assert_called_once_with(
            cr_name=cr_name, cluster=cluster, path="overview"
        )
        assert overview == {
            "taskmanagers": 10,
            "slots-total": 10,
            "flink-version": "1.6.4",
            "flink-commit": "6241481",
        }
Code example #4
def test_get_flink_jobmanager_overview():
    with mock.patch(
        "paasta_tools.flink_tools._dashboard_get",
        autospec=True,
        return_value='{"taskmanagers":10,"slots-total":10,"flink-version":"1.6.4","flink-commit":"6241481"}',
    ) as mock_dashboard_get:
        cluster = "mycluster"
        service = "kurupt_fm"
        instance = "radio_station"
        overview = flink_tools.get_flink_jobmanager_overview(
            service, instance, cluster)
        mock_dashboard_get.assert_called_once_with(
            service=service, instance=instance, cluster=cluster, path="overview"
        )
        assert overview == {
            "taskmanagers": 10,
            "slots-total": 10,
            "flink-version": "1.6.4",
            "flink-commit": "6241481",
        }
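
Examples #3 and #4 exercise two different signatures of get_flink_jobmanager_overview: the first passes the custom-resource name directly, while the second passes service and instance and lets the helper resolve the resource itself. Both mock _dashboard_get to return the dashboard's JSON body as a string, which suggests a function shape along the lines of the sketch below. This is an assumption inferred from the tests, not the actual paasta_tools code.

import json
from typing import Any, Mapping


def get_flink_jobmanager_overview(service: str, instance: str, cluster: str) -> Mapping[str, Any]:
    # _dashboard_get is the private helper mocked in the tests above; it is
    # assumed to return the raw JSON body of the dashboard's "overview" endpoint.
    # Example #1 catches ValueError from this call, so decoding errors surface
    # naturally as ValueError (json.JSONDecodeError is a ValueError subclass).
    response = _dashboard_get(
        service=service, instance=instance, cluster=cluster, path="overview"
    )
    return json.loads(response)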