import datetime
import json
import re
import time

# Project-specific helpers (kubectl, settings, util, alerts, manifest) and the
# TestFlows steps (Given, When, Then, And) plus error() come from the
# operator's e2e test framework.


def test_metrics_exporter_reboot():
    def check_monitoring_chi(operator_namespace, operator_pod, expect_result, max_retries=10):
        with And(f"metrics-exporter /chi endpoint result should return {expect_result}"):
            for i in range(1, max_retries):
                # hit /metrics first so the exporter refreshes its list of monitored instances
                kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                # then read /chi, which lists the installations the exporter currently monitors
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/chi",
                    ns=operator_namespace
                )
                out = json.loads(out)
                if out == expect_result:
                    break
                with Then(f"Not ready. Wait for {i * 5} seconds"):
                    time.sleep(i * 5)
            assert out == expect_result, error()

    with Given("clickhouse-operator is installed"):
        kubectl.wait_field("pods", "-l app=clickhouse-operator", ".status.containerStatuses[*].ready", "true,true",
                           ns=settings.operator_namespace)
        assert kubectl.get_count("pod", ns='--all-namespaces', label="-l app=clickhouse-operator") > 0, error()

        out = kubectl.launch("get pods -l app=clickhouse-operator", ns=settings.operator_namespace).splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = settings.operator_namespace

        kubectl.delete_ns(kubectl.namespace)
        kubectl.create_ns(kubectl.namespace)
        check_monitoring_chi(operator_namespace, operator_pod, [])

        with And("created simple clickhouse installation"):
            config = util.get_full_path("../docs/chi-examples/01-simple-layout-01-1shard-1repl.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 1,
                        "pod": 1,
                        "service": 2,
                    },
                    "do_not_delete": True,
                })
            expected_chi = [{
                "namespace": "test",
                "name": "simple-01",
                "hostnames": ["chi-simple-01-cluster-0-0.test.svc.cluster.local"]
            }]
            check_monitoring_chi(operator_namespace, operator_pod, expected_chi)

            with When("reboot metrics exporter"):
                kubectl.launch(f"exec -n {operator_namespace} {operator_pod} -c metrics-exporter -- reboot")
                time.sleep(15)
                kubectl.wait_field("pods", "-l app=clickhouse-operator", ".status.containerStatuses[*].ready",
                                   "true,true", ns=settings.operator_namespace)

                with Then("check metrics exporter still contains chi objects"):
                    check_monitoring_chi(operator_namespace, operator_pod, expected_chi)
                    kubectl.delete(config)
                    check_monitoring_chi(operator_namespace, operator_pod, [])
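# NOTE: hedged sketch, not part of the test above. It shows how the same
# metrics-exporter /chi endpoint can be queried ad hoc outside the test
# framework, assuming `kubectl` is on PATH and the exporter listens on
# port 8888 as in test_metrics_exporter_reboot().
import subprocess


def dump_monitored_chi(operator_namespace, operator_pod):
    """Return the list of installations the metrics-exporter currently monitors."""
    out = subprocess.check_output([
        "kubectl", "exec", "-n", operator_namespace, operator_pod,
        "-c", "metrics-exporter", "--",
        "wget", "-O-", "-q", "http://127.0.0.1:8888/chi",
    ])
    return json.loads(out)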
def test_backup_not_run(self):
    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image", "nginx:latest")
        kubectl.wait_field("pod", not_run_pod, ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        # push a fake clickhouse_backup_last_create_finish metric dated two days ago
        # into the nginx container that stands in for clickhouse-backup
        kubectl.launch(
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> /usr/share/nginx/html/metrics && '
            f'echo "clickhouse_backup_last_create_finish {int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())}" >> /usr/share/nginx/html/metrics '
            '\'')

        fired = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong", "firing", expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": not_run_pod}, time_range='60s')
        assert fired, error("can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()

    backup_name = prepare_table_for_backup(not_run_pod)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When("backup completes successfully"):
        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"')
        wait_backup_command_status(not_run_pod, f'create {backup_name}', expected_status='success')

        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"')
        wait_backup_command_status(not_run_pod, f'upload {backup_name}', expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong has gone away"):
        resolved = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong", "firing", expected_state=False,
            labels={"pod_name": not_run_pod})
        assert resolved, error("ClickhouseBackupDoesntRunTooLong alert has not resolved")
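# NOTE: hedged sketch of exec_on_backup_container() and
# wait_backup_command_status(), which test_backup_not_run() relies on; the
# real helpers live elsewhere in the suite and may differ. The sketch assumes
# clickhouse-backup's REST API serves GET /backup/actions as one JSON object
# per line with "command" and "status" fields.
def exec_on_backup_container(backup_pod, cmd, container="clickhouse-backup"):
    return kubectl.launch(f'exec {backup_pod} -c {container} -- {cmd}')


def wait_backup_command_status(backup_pod, command_name, expected_status='success', max_retries=60):
    for _ in range(max_retries):
        out = exec_on_backup_container(backup_pod, 'curl -sL http://127.0.0.1:7171/backup/actions')
        actions = [json.loads(line) for line in out.splitlines() if line.strip()]
        matched = [a for a in actions if a.get("command") == command_name]
        if matched and matched[-1].get("status") == expected_status:
            return
        time.sleep(5)
    assert False, error(f"backup command '{command_name}' did not reach status '{expected_status}'")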
def test_022(config="configs/test-022-broken-image.yaml"):
    chi = manifest.get_chi_name(util.get_full_path(config))
    kubectl.create_and_check(
        config=config,
        check={
            "pod_count": 1,
            "do_not_delete": 1,
            "chi_status": "InProgress",
        })
    with When("ClickHouse image can not be retrieved"):
        kubectl.wait_field(
            "pod",
            "chi-test-022-broken-image-default-0-0-0",
            ".status.containerStatuses[0].state.waiting.reason",
            "ErrImagePull")
        kubectl.delete_chi(chi)
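# NOTE: hedged sketch of a wait_field-style polling helper comparable to the
# kubectl.wait_field(...) calls used above; the project's actual helper lives
# in its kubectl module and may differ. Simplified to a single named resource,
# assuming `kubectl` is on PATH.
import subprocess


def wait_field_sketch(kind, name, jsonpath, expected, ns="test", retries=60, pause=5):
    """Poll `kubectl get` until the field selected by `jsonpath` equals `expected`."""
    for _ in range(retries):
        out = subprocess.run(
            ["kubectl", "get", kind, name, "-n", ns, "-o", f"jsonpath={{{jsonpath}}}"],
            capture_output=True, text=True,
        ).stdout.strip()
        if out == expected:
            return out
        time.sleep(pause)
    raise TimeoutError(f"{kind}/{name}: {jsonpath} never became {expected!r}")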
def wait_backup_pod_ready_and_curl_installed(backup_pod):
    with Then(f"wait {backup_pod} ready"):
        kubectl.wait_field("pod", backup_pod, ".status.containerStatuses[1].ready", "true")
        kubectl.launch(
            f'exec {backup_pod} -c clickhouse-backup -- curl --version')
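# NOTE: hedged sketch of prepare_table_for_backup(), which test_backup_not_run()
# calls but which is defined elsewhere in the suite. The table name, row count
# and the "clickhouse" container name are illustrative assumptions; the intent
# is only to create some data to back up and return a unique backup name.
def prepare_table_for_backup(backup_pod, rows=1000):
    backup_name = f'test_backup_{time.strftime("%Y-%m-%d_%H-%M-%S")}'
    kubectl.launch(
        f'exec {backup_pod} -c clickhouse -- clickhouse-client -q '
        '"CREATE TABLE IF NOT EXISTS default.test_backup (i UInt64) ENGINE=MergeTree() ORDER BY tuple()"')
    kubectl.launch(
        f'exec {backup_pod} -c clickhouse -- clickhouse-client -q '
        f'"INSERT INTO default.test_backup SELECT number FROM numbers({rows})"')
    return backup_name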
def test_backup_duration(self):
    short_pod, _, long_pod, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup duration metric")

    for pod in [short_pod, long_pod]:
        with Then(f"wait {pod} ready"):
            kubectl.wait_field("pod", pod, ".spec.containers[1].image", "nginx:latest")
            kubectl.wait_field("pod", pod, ".status.containerStatuses[1].ready", "true")

        fired = alerts.wait_alert_state(
            "ClickHouseBackupTooLong", "firing", expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": pod}, time_range='60s')
        assert fired, error(f"can't get ClickHouseBackupTooLong alert in firing state for {pod}")

    with Then("wait for prometheus to scrape the fake data"):
        time.sleep(70)

    with Then(f"decrease {short_pod} backup duration"):
        # overwrite the fake metrics with a smaller create duration and a successful
        # last-create status; the next check expects ClickHouseBackupTooShort to fire
        kubectl.launch(
            f'exec {short_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_duration Backup create duration in nanoseconds" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_duration gauge" >> /usr/share/nginx/html/metrics && '
            'echo "clickhouse_backup_last_create_duration 7000000000000" >> /usr/share/nginx/html/metrics && '
            'echo "# HELP clickhouse_backup_last_create_status Last backup create status: 0=failed, 1=success, 2=unknown" >> /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_status gauge" >> /usr/share/nginx/html/metrics && '
            'echo "clickhouse_backup_last_create_status 1" >> /usr/share/nginx/html/metrics'
            '\'')

        fired = alerts.wait_alert_state(
            "ClickHouseBackupTooShort", "firing", expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": short_pod}, time_range='60s')
        assert fired, error("can't get ClickHouseBackupTooShort alert in firing state")

    apply_normal_backup()

    with Then("check ClickHouseBackupTooShort has gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseBackupTooShort", "firing", expected_state=False,
            labels={"pod_name": short_pod})
        assert resolved, error("ClickHouseBackupTooShort alert has not resolved")

    with Then("check ClickHouseBackupTooLong has gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseBackupTooLong", "firing", expected_state=False,
            labels={"pod_name": long_pod})
        assert resolved, error("ClickHouseBackupTooLong alert has not resolved")
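# NOTE: hedged sketch of apply_fake_backup() and apply_normal_backup(), which
# the backup tests above call but which are defined elsewhere in the suite.
# The manifest file names below are hypothetical placeholders: the idea is to
# swap the clickhouse-backup sidecar for an nginx container serving a static
# /usr/share/nginx/html/metrics file, so the tests can fake backup metrics,
# and then swap the real sidecar back.
def apply_fake_backup(message):
    with Given(message):
        kubectl.create_and_check(
            config="configs/test-cluster-for-backups-fake.yaml",  # hypothetical manifest
            check={"pod_count": 2, "do_not_delete": 1})


def apply_normal_backup():
    with Given("apply the normal clickhouse-backup sidecar back"):
        kubectl.create_and_check(
            config="configs/test-cluster-for-backups.yaml",  # hypothetical manifest
            check={"pod_count": 2, "do_not_delete": 1})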