Esempio n. 1
0
def test_send_replication_event_if_under_replication_critical(instance_config):
    """With 89 of 100 expected instances available, a status=2 event is sent.

    Also checks that the alert output contains the availability summary and
    the ``paasta status`` command line for the instance under test.
    """
    patcher = mock.patch(
        'paasta_tools.monitoring_tools.send_replication_event',
        autospec=True,
    )
    with patcher as fake_send_event:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=100,
            num_available=89,
        )

    # The mock keeps its recorded calls after the patch has been undone.
    fake_send_event.assert_called_once_with(
        instance_config=instance_config,
        status=2,
        output=mock.ANY,
    )
    alert_output = fake_send_event.call_args[1]["output"]

    expected_summary = (
        "{} has 89 out of 100 expected instances available!\n(threshold: 90%)"
    ).format(instance_config.job_id)
    expected_command = "paasta status -s {} -i {} -c {} -vv".format(
        instance_config.service,
        instance_config.instance,
        instance_config.cluster,
    )
    assert expected_summary in alert_output
    assert expected_command in alert_output
Esempio n. 2
0
def test_send_replication_event_if_under_replication_handles_0_expected(
    instance_config,
):
    """With 0 expected and 0 available instances, a status=0 event is sent.

    The call is made with ``dry_run=True`` and the test checks that the flag
    is forwarded to ``send_replication_event`` and that the output contains
    the "0/0 replicas" summary line.
    """
    patcher = mock.patch(
        "paasta_tools.monitoring_tools.send_replication_event", autospec=True
    )
    with patcher as fake_send_event:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=0,
            num_available=0,
            dry_run=True,
        )

    # The mock keeps its recorded calls after the patch has been undone.
    fake_send_event.assert_called_once_with(
        instance_config=instance_config,
        status=0,
        output=mock.ANY,
        description=mock.ANY,
        dry_run=True,
    )
    alert_output = fake_send_event.call_args[1]["output"]

    expected_summary = "{} has 0/0 replicas available (threshold: 90%)".format(
        instance_config.job_id
    )
    assert expected_summary in alert_output
Esempio n. 3
0
def check_healthy_marathon_tasks_for_service_instance(instance_config,
                                                      expected_count,
                                                      all_tasks):
    """Report replication for an instance based on its healthy Marathon tasks.

    The healthy-task figure comes from
    ``filter_healthy_marathon_instances_for_short_app_id`` and is handed to
    ``monitoring_tools.send_replication_event_if_under_replication``.

    :param instance_config: config object providing ``service`` and ``instance``
    :param expected_count: number of instances expected to be available
    :param all_tasks: Marathon tasks, passed through to the healthy-task filter
    """
    app_id = format_job_id(instance_config.service, instance_config.instance)
    num_healthy_tasks = filter_healthy_marathon_instances_for_short_app_id(
        all_tasks=all_tasks, app_id=app_id)
    # Hand app_id to the logger as an argument so the message is only
    # interpolated when the record is actually emitted.
    log.info("Checking %s in marathon as it is not in smartstack", app_id)
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=num_healthy_tasks,
    )
def test_send_replication_event_if_under_replication_good(instance_config):
    """With 100 of 100 expected instances available, a status=0 event is sent.

    Also checks that the alert output contains the availability summary.
    """
    patcher = mock.patch(
        "paasta_tools.monitoring_tools.send_replication_event",
        autospec=True,
    )
    with patcher as fake_send_event:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=100,
            num_available=100,
        )

    # The mock keeps its recorded calls after the patch has been undone.
    fake_send_event.assert_called_once_with(
        instance_config=instance_config,
        status=0,
        output=mock.ANY,
    )
    alert_output = fake_send_event.call_args[1]["output"]

    expected_summary = (
        "{} has 100 out of 100 expected instances available!\n(threshold: 90%)"
    ).format(instance_config.job_id)
    assert expected_summary in alert_output
Esempio n. 5
0
def check_healthy_kubernetes_tasks_for_service_instance(
    instance_config: KubernetesDeploymentConfig,
    expected_count: int,
    all_pods: Sequence[V1Pod],
) -> None:
    """Report replication for an instance based on its ready Kubernetes pods.

    Pods are narrowed to this service/instance with
    ``filter_pods_by_service_instance``; those for which ``is_pod_ready`` is
    truthy are counted and the count is handed to
    ``monitoring_tools.send_replication_event_if_under_replication``.

    :param instance_config: config object providing ``service`` and ``instance``
    :param expected_count: number of instances expected to be available
    :param all_pods: pods to search for this service instance
    """
    si_pods = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )
    # Count ready pods without materialising an intermediate list.
    num_healthy_tasks = sum(1 for pod in si_pods if is_pod_ready(pod))
    # Logger arguments defer interpolation until the record is emitted.
    log.info(
        "Checking %s.%s in kubernetes as it is not in smartstack",
        instance_config.service,
        instance_config.instance,
    )
    monitoring_tools.send_replication_event_if_under_replication(
        instance_config=instance_config,
        expected_count=expected_count,
        num_available=num_healthy_tasks,
    )
def check_flink_service_health(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Send health events for a Flink instance and its three sub-components.

    First sends a taskmanager-count event built from the jobmanager overview
    (or from the ``ValueError`` text if fetching the overview fails), then one
    replication event each for "supervisor", "jobmanager" and "taskmanager".

    :param instance_config: Flink config providing ``service``, ``instance``,
        ``cluster`` and ``config_dict``
    :param all_pods: pods to search for this service instance
    :param smartstack_replication_checker: accepted but not used in this body
    """
    pods_for_instance = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    # Falls back to 10 when either the "taskmanager" section or its
    # "instances" key is absent from the config.
    taskmanager_section = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    wanted_taskmanagers = taskmanager_section.get("instances", 10)

    # (sub_component, expected count, healthy count); the healthy counts are
    # gathered up front, before the jobmanager overview is requested.
    replication_checks = [
        (name, wanted, healthy_flink_containers_cnt(pods_for_instance, name))
        for name, wanted in (
            ("supervisor", 1),
            ("jobmanager", 1),
            ("taskmanager", wanted_taskmanagers),
        )
    ]

    error_text = None
    taskmanagers_seen = None
    try:
        taskmanagers_seen = flink_tools.get_flink_jobmanager_overview(
            instance_config.service,
            instance_config.instance,
            instance_config.cluster,
        ).get("taskmanagers", 0)
    except ValueError as exc:
        error_text = str(exc)

    send_event_if_not_enough_taskmanagers(
        instance_config=instance_config,
        expected_count=wanted_taskmanagers,
        num_reported=taskmanagers_seen,
        strerror=error_text,
    )

    for name, wanted, healthy in replication_checks:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=wanted,
            num_available=healthy,
            sub_component=name,
        )
Esempio n. 7
0
def check_flink_service_replication(
    instance_config: FlinkDeploymentConfig,
    all_pods: Sequence[V1Pod],
    smartstack_replication_checker: KubeSmartstackReplicationChecker,
) -> None:
    """Send one replication event per Flink sub-component of an instance.

    Healthy container counts for "supervisor", "jobmanager" and "taskmanager"
    are compared against 1, 1 and the configured taskmanager instance count.

    :param instance_config: Flink config providing ``service``, ``instance``
        and ``config_dict``
    :param all_pods: pods to search for this service instance
    :param smartstack_replication_checker: accepted but not used in this body
    """
    pods_for_instance = filter_pods_by_service_instance(
        pod_list=all_pods,
        service=instance_config.service,
        instance=instance_config.instance,
    )

    # Falls back to 10 when either the "taskmanager" section or its
    # "instances" key is absent from the config.
    taskmanager_section = instance_config.config_dict.get(
        "taskmanager", {"instances": 10}
    )
    wanted_taskmanagers = taskmanager_section.get("instances", 10)

    # (sub_component, expected count, healthy count); all healthy counts are
    # gathered before any event is sent.
    replication_checks = [
        (name, wanted, healthy_flink_containers_cnt(pods_for_instance, name))
        for name, wanted in (
            ("supervisor", 1),
            ("jobmanager", 1),
            ("taskmanager", wanted_taskmanagers),
        )
    ]

    # TBD: check cnt according to Flink

    for name, wanted, healthy in replication_checks:
        monitoring_tools.send_replication_event_if_under_replication(
            instance_config=instance_config,
            expected_count=wanted,
            num_available=healthy,
            sub_component=name,
        )