def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    si_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    # Default to 10 taskmanagers when the config does not specify a count.
    taskmanagers_expected_cnt = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    ).get("instances", 10)
    num_healthy_supervisors = healthy_flink_containers_cnt(si_pods, "supervisor")
    num_healthy_jobmanagers = healthy_flink_containers_cnt(si_pods, "jobmanager")
    num_healthy_taskmanagers = healthy_flink_containers_cnt(si_pods, "taskmanager")

    strerror = None
    reported_taskmanagers = None
    try:
        overview = flink_tools.get_flink_jobmanager_overview(
            instance_config.service, instance_config.instance, instance_config.cluster
        )
        reported_taskmanagers = overview.get("taskmanagers", 0)
    except ValueError as e:
        # The dashboard overview may be unavailable; capture the error so it
        # can be attached to the event below instead of failing the check.
        strerror = str(e)

    send_event_if_not_enough_taskmanagers(
        instance_config=instance_config,
        expected_count=taskmanagers_expected_cnt,
        num_reported=reported_taskmanagers,
        strerror=strerror,
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=1,
        num_available=num_healthy_supervisors,
        sub_component="supervisor",
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=1,
        num_available=num_healthy_jobmanagers,
        sub_component="jobmanager",
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=taskmanagers_expected_cnt,
        num_available=num_healthy_taskmanagers,
        sub_component="taskmanager",
    )
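# For reference: the jobmanager "overview" payload parsed by
# flink_tools.get_flink_jobmanager_overview is a small JSON document; the
# tests further below mock it verbatim. A minimal sketch (sample values taken
# from those tests) of the only field the health check relies on:
overview_example = {
    "taskmanagers": 10,
    "slots-total": 10,
    "flink-version": "1.6.4",
    "flink-commit": "6241481",
}
# check_flink_service_health reads only the registered-taskmanager count; a
# missing key is treated as zero registered taskmanagers.
assert overview_example.get("taskmanagers", 0) == 10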
def check_under_registered_taskmanagers(
    instance_config: FlinkDeploymentConfig,
    expected_count: int,
    cr_name: str,
) -> Tuple[bool, str, str]:
    """Check whether fewer taskmanagers than expected have registered with the
    jobmanager. Returns the result of the check as a boolean, along with
    human-readable output and a description for use in logging or monitoring
    events.
    """
    unhealthy = True
    if cr_name != "":
        try:
            overview = flink_tools.get_flink_jobmanager_overview(
                cr_name, instance_config.cluster
            )
            num_reported = overview.get("taskmanagers", 0)
            crit_threshold = instance_config.get_replication_crit_percentage()
            output = (
                f"{instance_config.job_id} has {num_reported}/{expected_count} "
                f"taskmanagers reported by dashboard (threshold: {crit_threshold}%)"
            )
            unhealthy, _ = is_under_replicated(
                num_reported, expected_count, crit_threshold
            )
        except ValueError as e:
            output = (
                f"Dashboard of service {instance_config.job_id} is not available ({e})"
            )
    else:
        output = f"Dashboard of service {instance_config.job_id} is not available"
    if unhealthy:
        description = f"""
This alert means that the Flink dashboard is not reporting the expected
number of taskmanagers.

Reasons this might be happening:

  The service may simply be unhealthy. There also may not be enough resources
  in the cluster to support the requested instance count.

Things you can do:

  * Fix the cause of the unhealthy service. Try running:

      paasta status -s {instance_config.service} -i {instance_config.instance} -c {instance_config.cluster} -vv
"""
    else:
        description = f"{instance_config.job_id} taskmanager is available"
    return unhealthy, output, description
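# A minimal sketch of a caller, showing how the (unhealthy, output, description)
# tuple returned above can be routed to logging. `log_taskmanager_check` is a
# hypothetical helper written for illustration, not part of paasta_tools; it
# assumes an already-loaded FlinkDeploymentConfig.
import logging

log = logging.getLogger(__name__)


def log_taskmanager_check(
    instance_config: FlinkDeploymentConfig, expected_count: int, cr_name: str
) -> bool:
    unhealthy, output, description = check_under_registered_taskmanagers(
        instance_config=instance_config,
        expected_count=expected_count,
        cr_name=cr_name,
    )
    if unhealthy:
        # `output` is the one-line summary; `description` carries remediation hints.
        log.error(output)
        log.debug(description)
    else:
        log.info(output)
    return unhealthy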
def test_get_flink_jobmanager_overview():
    with mock.patch(
        "paasta_tools.flink_tools._dashboard_get",
        autospec=True,
        return_value='{"taskmanagers":10,"slots-total":10,"flink-version":"1.6.4","flink-commit":"6241481"}',
    ) as mock_dashboard_get:
        cluster = "mycluster"
        cr_name = "kurupt--fm-7c7b459d59"
        overview = flink_tools.get_flink_jobmanager_overview(cr_name, cluster)
        mock_dashboard_get.assert_called_once_with(
            cr_name=cr_name, cluster=cluster, path="overview"
        )
        assert overview == {
            "taskmanagers": 10,
            "slots-total": 10,
            "flink-version": "1.6.4",
            "flink-commit": "6241481",
        }
def test_get_flink_jobmanager_overview():
    with mock.patch(
        "paasta_tools.flink_tools._dashboard_get",
        autospec=True,
        return_value='{"taskmanagers":10,"slots-total":10,"flink-version":"1.6.4","flink-commit":"6241481"}',
    ) as mock_dashboard_get:
        cluster = "mycluster"
        service = "kurupt_fm"
        instance = "radio_station"
        overview = flink_tools.get_flink_jobmanager_overview(service, instance, cluster)
        mock_dashboard_get.assert_called_once_with(
            service=service, instance=instance, cluster=cluster, path="overview"
        )
        assert overview == {
            "taskmanagers": 10,
            "slots-total": 10,
            "flink-version": "1.6.4",
            "flink-commit": "6241481",
        }