# Shared imports assumed by the examples below; `kubectl`, `settings`, `util`,
# `manifest`, and `alerts` are project-local helpers from the clickhouse-operator
# e2e test suite. The BDD steps and error() come from testflows.
import datetime
import json
import re
import time

from testflows.asserts import error
from testflows.core import And, Given, Then, When


def test_metrics_exporter_reboot():
    def check_monitoring_chi(operator_namespace, operator_pod, expect_result, max_retries=10):
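        # Poll the exporter until /chi returns the expected CHI list, backing off
        # linearly (5s, 10s, ...) between attempts.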
        with And(f"metrics-exporter /chi enpoint result should return {expect_result}"):
            for i in range(1, max_retries + 1):
                # hit /metrics first to trigger a refresh of the monitored instances
                kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                # then check /chi after the monitored instances have been refreshed
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/chi",
                    ns=operator_namespace
                )
                out = json.loads(out)
                if out == expect_result:
                    break
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
            assert out == expect_result, error()

    with Given("clickhouse-operator is installed"):
        kubectl.wait_field("pods", "-l app=clickhouse-operator", ".status.containerStatuses[*].ready", "true,true",
                           ns=settings.operator_namespace)
        assert kubectl.get_count("pod", ns='--all-namespaces', label="-l app=clickhouse-operator") > 0, error()

        out = kubectl.launch("get pods -l app=clickhouse-operator", ns=settings.operator_namespace).splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = settings.operator_namespace
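        # recreate the test namespace so the exporter should report an empty /chi list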
        kubectl.delete_ns(kubectl.namespace)
        kubectl.create_ns(kubectl.namespace)
        check_monitoring_chi(operator_namespace, operator_pod, [])
        with And("created simple clickhouse installation"):
            config = util.get_full_path("../docs/chi-examples/01-simple-layout-01-1shard-1repl.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 1,
                        "pod": 1,
                        "service": 2,
                    },
                    "do_not_delete": True,
                })
            expected_chi = [{
                "namespace": "test", "name": "simple-01",
                "hostnames": ["chi-simple-01-cluster-0-0.test.svc.cluster.local"]
            }]
            check_monitoring_chi(operator_namespace, operator_pod, expected_chi)
            with When("reboot metrics exporter"):
                kubectl.launch(f"exec -n {operator_namespace} {operator_pod} -c metrics-exporter -- reboot")
                time.sleep(15)
                kubectl.wait_field("pods", "-l app=clickhouse-operator",
                                        ".status.containerStatuses[*].ready", "true,true",
                                   ns=settings.operator_namespace)
                with Then("check metrics exporter still contains chi objects"):
                    check_monitoring_chi(operator_namespace, operator_pod, expected_chi)
                    kubectl.delete(config)
                    check_monitoring_chi(operator_namespace, operator_pod, [])
Example #2
def test_backup_not_run(self):
    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
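    # apply_fake_backup swaps the clickhouse-backup sidecar for an nginx container
    # serving a static metrics file (hence the image and readiness checks below),
    # so the test can control the exported backup metrics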
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image",
                           "nginx:latest")
        kubectl.wait_field("pod", not_run_pod,
                           ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        kubectl.launch(
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> /usr/share/nginx/html/metrics && '
            f'echo "clickhouse_backup_last_create_finish {int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())}" >> /usr/share/nginx/html/metrics '
            '\'')

        fired = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": not_run_pod},
            time_range='60s')
        assert fired, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()

    backup_name = prepare_table_for_backup(not_run_pod)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When('Backup is successful'):
        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'create {backup_name}',
                                   expected_status='success')

        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'upload {backup_name}',
                                   expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong gone away"):
        resolved = alerts.wait_alert_state("ClickhouseBackupDoesntRunTooLong",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": not_run_pod})
        assert resolved, error(
            "ClickhouseBackupDoesntRunTooLong alert didn't go away")
Example #3
def test_022(config="configs/test-022-broken-image.yaml"):
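    # the manifest references an image that cannot be pulled, so the CHI is
    # expected to stay "InProgress" while the pod sits in ErrImagePull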
    chi = manifest.get_chi_name(util.get_full_path(config))
    kubectl.create_and_check(config=config,
                             check={
                                 "pod_count": 1,
                                 "do_not_delete": 1,
                                 "chi_status": "InProgress",
                             })
    with When("ClickHouse image can not be retrieved"):
        kubectl.wait_field(
            "pod", "chi-test-022-broken-image-default-0-0-0",
            ".status.containerStatuses[0].state.waiting.reason",
            "ErrImagePull")
        kubectl.delete_chi(chi)
Example #4
def wait_backup_pod_ready_and_curl_installed(backup_pod):
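    # containerStatuses[1] is the clickhouse-backup sidecar; besides readiness,
    # verify that curl is available inside it for later backup API calls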
    with Then(f"wait {backup_pod} ready"):
        kubectl.wait_field("pod", backup_pod,
                           ".status.containerStatuses[1].ready", "true")
        kubectl.launch(
            f'exec {backup_pod} -c clickhouse-backup -- curl --version')
Example #5
def test_backup_duration(self):
    short_pod, _, long_pod, _ = alerts.random_pod_choice_for_callbacks(chi)
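    # as in test_backup_not_run, the fake backup replaces the clickhouse-backup
    # sidecar with nginx so the test can publish arbitrary duration metrics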
    apply_fake_backup("prepare fake backup duration metric")

    for pod in [short_pod, long_pod]:
        with Then(f"wait {pod} ready"):
            kubectl.wait_field("pod", pod, ".spec.containers[1].image",
                               "nginx:latest")
            kubectl.wait_field("pod", pod,
                               ".status.containerStatuses[1].ready", "true")

            fired = alerts.wait_alert_state(
                "ClickHouseBackupTooLong",
                "firing",
                expected_state=True,
                sleep_time=settings.prometheus_scrape_interval,
                labels={"pod_name": pod},
                time_range='60s')
            assert fired, error(
                f"can't get ClickHouseBackupTooLong alert in firing state for {pod}"
            )

    with Then(f"wait when prometheus will scrape fake data"):
        time.sleep(70)

    with Then(f"decrease {short_pod} backup duration"):
        kubectl.launch(
            f'exec {short_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_duration Backup create duration in nanoseconds" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_duration gauge" >> /usr/share/nginx/html/metrics && '
            'echo "clickhouse_backup_last_create_duration 7000000000000" >> /usr/share/nginx/html/metrics && '
            'echo "# HELP clickhouse_backup_last_create_status Last backup create status: 0=failed, 1=success, 2=unknown" >> /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_status gauge" >> /usr/share/nginx/html/metrics && '
            'echo "clickhouse_backup_last_create_status 1" >> /usr/share/nginx/html/metrics'
            '\'')

        fired = alerts.wait_alert_state(
            "ClickHouseBackupTooShort",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": short_pod},
            time_range='60s')
        assert fired, error(
            "can't get ClickHouseBackupTooShort alert in firing state")

    apply_normal_backup()

    with Then("check ClickHouseBackupTooShort gone away"):
        resolved = alerts.wait_alert_state("ClickHouseBackupTooShort",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": short_pod})
        assert resolved, error(
            "ClickHouseBackupTooShort alert didn't go away")

    with Then("check ClickHouseBackupTooLong gone away"):
        resolved = alerts.wait_alert_state("ClickHouseBackupTooLong",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": long_pod})
        assert resolved, error(
            "ClickHouseBackupTooLong alert didn't go away")