def test_version_changed(self):
    """Check that ClickHouseVersionChanged fires after a settings change and clears after rollback.

    The scenario applies the "changed settings" manifest, waits two scrape
    intervals, expects the alert to fire for the selected service, then
    re-applies the baseline manifest and expects the alert to stop firing.
    """
    _changed_pod, changed_svc, _, _ = alerts.random_pod_choice_for_callbacks(
        chi)

    changed_check = {
        "apply_templates": [
            "templates/tpl-clickhouse-stable.yaml",
            "templates/tpl-persistent-volume-100Mi.yaml",
        ],
        "object_counts": {
            "statefulset": 2,
            "pod": 2,
            "service": 3,
        },
        "do_not_delete": 1,
    }
    baseline_check = {
        "apply_templates": [
            "templates/tpl-clickhouse-latest.yaml",
            "templates/tpl-clickhouse-alerts.yaml",
            "templates/tpl-persistent-volume-100Mi.yaml",
        ],
        "object_counts": {
            "statefulset": 2,
            "pod": 2,
            "service": 3,
        },
        "do_not_delete": 1,
    }

    with When("apply changed settings"):
        kubectl.create_and_check(
            config="configs/test-cluster-for-alerts-changed-settings.yaml",
            check=changed_check,
        )
        # Fixed local interval (seconds); the step waits twice this long.
        scrape_interval = 15
        with Then(f"wait prometheus_scrape_interval={scrape_interval}*2 sec"):
            time.sleep(scrape_interval * 2)

    with Then("check ClickHouseVersionChanged firing"):
        is_firing = alerts.wait_alert_state(
            "ClickHouseVersionChanged",
            "firing",
            True,
            labels={"hostname": changed_svc},
            time_range="30s",
            sleep_time=settings.prometheus_scrape_interval,
        )
        assert is_firing, error(
            "can't get ClickHouseVersionChanged alert in firing state")

    with When("rollback changed settings"):
        kubectl.create_and_check(
            config="configs/test-cluster-for-alerts.yaml",
            check=baseline_check,
        )

    with Then("check ClickHouseVersionChanged gone away"):
        is_resolved = alerts.wait_alert_state(
            "ClickHouseVersionChanged",
            "firing",
            False,
            labels={"hostname": changed_svc},
            sleep_time=30,
        )
        assert is_resolved, error(
            "can't check ClickHouseVersionChanged alert is gone away")
def test_021(config="configs/test-021-rescale-volume-01.yaml"):
    """Rescale a ClickHouse volume and then attach a second disk.

    Steps: make sure the default storage class allows expansion, create the
    installation from ``config`` (100Mi), enlarge the disk to 200Mi, add a
    second 50Mi disk, and finally confirm ClickHouse reports two disks.
    The installation is deleted at the end.
    """
    disk1_pvc = "disk1-chi-test-021-rescale-volume-simple-0-0-0"
    disk2_pvc = "disk2-chi-test-021-rescale-volume-simple-0-0-0"

    with Given("Default storage class is expandable"):
        storage_class = kubectl.get_default_storage_class()
        assert storage_class is not None
        assert len(storage_class) > 0
        expansion_flag = kubectl.get_field(
            "storageclass", storage_class, ".allowVolumeExpansion")
        if expansion_flag != "true":
            kubectl.launch(
                f"patch storageclass {storage_class} -p '{{\"allowVolumeExpansion\":true}}'"
            )

    chi = manifest.get_chi_name(util.get_full_path(config))
    kubectl.create_and_check(
        config=config,
        check={
            "pod_count": 1,
            "do_not_delete": 1,
        },
    )

    with Then("Storage size should be 100Mi"):
        size = kubectl.get_pvc_size(disk1_pvc)
        assert size == "100Mi"

    with When("Re-scale volume configuration to 200Mb"):
        kubectl.create_and_check(
            config="configs/test-021-rescale-volume-02-enlarge-disk.yaml",
            check={
                "pod_count": 1,
                "do_not_delete": 1,
            },
        )

        with Then("Storage size should be 200Mi"):
            size = kubectl.get_pvc_size(disk1_pvc)
            assert size == "200Mi"

    with When("Add second disk 50Mi"):
        kubectl.create_and_check(
            config="configs/test-021-rescale-volume-03-add-disk.yaml",
            check={
                "pod_count": 1,
                "pod_volumes": {
                    "/var/lib/clickhouse",
                    "/var/lib/clickhouse2",
                },
                "do_not_delete": 1,
            },
        )

        with Then("There should be two PVC"):
            for pvc_name, wanted_size in ((disk1_pvc, "200Mi"),
                                          (disk2_pvc, "50Mi")):
                size = kubectl.get_pvc_size(pvc_name)
                assert size == wanted_size

        with And("There should be two disks recognized by ClickHouse"):
            # ClickHouse requires some time to mount volume. Race conditions.
            time.sleep(120)
            disk_count = clickhouse.query(
                chi, "SELECT count() FROM system.disks")
            print("SELECT count() FROM system.disks RETURNED:")
            print(disk_count)
            assert disk_count == "2"

    kubectl.delete_chi(chi)
def test_insert_related_alerts(self):
    """Exercise the insert-related alerts by flooding a MergeTree table with tiny parts.

    Phase one inserts ``parts_to_delay_insert`` parts through the "delayed"
    service and expects ClickHouseDelayedInsertThrottling,
    ClickHouseMaxPartCountForPartition and ClickHouseLowInsertedRowsPerQuery
    to fire and then clear once merges are restarted. Phase two inserts
    ``parts_to_throw_insert`` parts through the "rejected" service and expects
    ClickHouseRejectedInsert to fire and clear.
    """
    clickhouse.create_table_on_cluster(chi)
    _delayed_pod, delayed_svc, _rejected_pod, rejected_svc = alerts.random_pod_choice_for_callbacks(
        chi)

    scrape_interval = settings.prometheus_scrape_interval
    # default values in system.merge_tree_settings
    parts_to_throw_insert = 300
    parts_to_delay_insert = 150
    chi_name = chi["metadata"]["name"]

    def insert_many_parts(parts_count, target_svc):
        """Stop merges and insert ``parts_count`` single-row parts via ``target_svc``."""
        stop_merges = "SYSTEM STOP MERGES default.test;"
        min_block = "SET max_block_size=1; SET max_insert_block_size=1; SET min_insert_block_size_rows=1;"
        single_row_insert = (
            min_block +
            "INSERT INTO default.test(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1;"
        )
        with When(f"Insert to MergeTree table {parts_count} parts"):
            bulk_insert = (
                stop_merges + min_block +
                f"INSERT INTO default.test(event_time, test) SELECT now(),number FROM system.numbers LIMIT {parts_count};"
            )
            clickhouse.query(chi_name, bulk_insert, host=target_svc,
                             ns=kubectl.namespace)

            # @TODO we need only one query after resolve https://github.com/ClickHouse/ClickHouse/issues/11384 and switch to 21.3+
            clickhouse.query_with_error(chi_name, single_row_insert,
                                        host=target_svc,
                                        ns=kubectl.namespace)
            with Then(
                    f"wait prometheus_scrape_interval={scrape_interval}*2 sec"
            ):
                time.sleep(scrape_interval * 2)

            clickhouse.query_with_error(chi_name, single_row_insert,
                                        host=target_svc,
                                        ns=kubectl.namespace)

    # Phase 1: enough parts to delay inserts on the "delayed" service.
    insert_many_parts(parts_to_delay_insert, delayed_svc)

    delayed_alerts = (
        ("ClickHouseDelayedInsertThrottling", "60s"),
        ("ClickHouseMaxPartCountForPartition", "90s"),
        ("ClickHouseLowInsertedRowsPerQuery", "120s"),
    )
    for alert_name, time_range in delayed_alerts:
        with Then(f"check {alert_name} firing"):
            fired = alerts.wait_alert_state(
                alert_name,
                "firing",
                True,
                labels={"hostname": delayed_svc},
                time_range=time_range,
            )
            assert fired, error(
                f"can't get {alert_name} alert in firing state")

    clickhouse.query(chi_name, "SYSTEM START MERGES default.test",
                     host=delayed_svc, ns=kubectl.namespace)

    for alert_name, _ in delayed_alerts:
        with Then(f"check {alert_name} gone away"):
            resolved = alerts.wait_alert_state(
                alert_name,
                "firing",
                False,
                labels={"hostname": delayed_svc},
            )
            assert resolved, error(
                f"can't check {alert_name} alert is gone away")

    # Phase 2: enough parts to make ClickHouse reject inserts on the other service.
    insert_many_parts(parts_to_throw_insert, rejected_svc)

    with Then("check ClickHouseRejectedInsert firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseRejectedInsert",
            "firing",
            True,
            labels={"hostname": rejected_svc},
            time_range="30s",
            sleep_time=settings.prometheus_scrape_interval,
        )
        assert fired, error(
            "can't get ClickHouseRejectedInsert alert in firing state")

    with Then("check ClickHouseRejectedInsert gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseRejectedInsert",
            "firing",
            False,
            labels={"hostname": rejected_svc},
        )
        assert resolved, error(
            "can't check ClickHouseRejectedInsert alert is gone away")

    clickhouse.query(chi_name, "SYSTEM START MERGES default.test",
                     host=rejected_svc, ns=kubectl.namespace)
    clickhouse.drop_table_on_cluster(chi)
def test_read_only_replica(self):
    """Check that ClickHouseReadonlyReplica fires while ZooKeeper is restarted and then clears.

    A replicated table is created, ZooKeeper's PID 1 is killed from the alert
    polling callback, and the alert is expected to fire for the selected
    replica. Afterwards the test waits for ZooKeeper to answer ``imok``,
    restarts/syncs replicas on both services and drops the table.
    """
    _read_only_pod, read_only_svc, _other_pod, other_svc = alerts.random_pod_choice_for_callbacks(
        chi)
    chi_name = chi["metadata"]["name"]

    table_schema = (
        "(event_time DateTime, test UInt64) "
        "ENGINE ReplicatedMergeTree('/clickhouse/tables/{installation}-{shard}/test_repl', '{replica}') ORDER BY tuple()"
    )
    clickhouse.create_table_on_cluster(chi, 'all-replicated',
                                       'default.test_repl', table_schema)

    def restart_zookeeper():
        """Kill ZooKeeper's main process and attempt an insert on the replica."""
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(
            chi_name,
            "INSERT INTO default.test_repl VALUES(now(),rand())",
            host=read_only_svc,
        )

    with Then("check ClickHouseReadonlyReplica firing"):
        is_firing = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            True,
            labels={"hostname": read_only_svc},
            time_range='30s',
            sleep_time=settings.prometheus_scrape_interval,
            callback=restart_zookeeper,
        )
        assert is_firing, error(
            "can't get ClickHouseReadonlyReplica alert in firing state")

    with Then("check ClickHouseReadonlyReplica gone away"):
        is_resolved = alerts.wait_alert_state(
            "ClickHouseReadonlyReplica",
            "firing",
            False,
            labels={"hostname": read_only_svc},
        )
        assert is_resolved, error(
            "can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0",
                          "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    # Probe ZooKeeper with the "ruok" four-letter command until it answers "imok".
    last_attempt = 10
    for attempt in range(last_attempt + 1):
        zk_reply = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"",
            ok_to_fail=True,
        )
        if "imok" in zk_reply:
            break
        if attempt == last_attempt:
            fail(f"invalid zookeeper status after {attempt} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    for svc in (read_only_svc, other_svc):
        clickhouse.query_with_error(
            chi_name,
            "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
            host=svc,
            timeout=240,
        )

    clickhouse.drop_table_on_cluster(chi, 'all-replicated',
                                     'default.test_repl')
def test_metrics_exporter_reboot():
    """Check that metrics-exporter still lists monitored installations after a reboot.

    The scenario recreates the test namespace, verifies the exporter's ``/chi``
    endpoint is empty, creates a simple installation, verifies it is listed,
    reboots the metrics-exporter container, verifies the listing again and
    finally deletes the installation and expects an empty listing.
    """

    def check_monitoring_chi(operator_namespace,
                             operator_pod,
                             expect_result,
                             max_retries=10):
        """Poll the exporter's ``/chi`` endpoint until it equals ``expect_result``.

        Makes up to ``max_retries`` attempts, sleeping ``attempt * 5`` seconds
        between them. The endpoint is always re-read after the last sleep, so
        the final assertion never judges a stale response.
        """
        with And(
                f"metrics-exporter /chi enpoint result should return {expect_result}"
        ):
            out = None
            for attempt in range(1, max_retries + 1):
                out = kubectl.kubectl(
                    f"exec {operator_pod} -c metrics-exporter wget -- -O- -q http://127.0.0.1:8888/chi",
                    ns=operator_namespace)
                out = json.loads(out)
                if out == expect_result:
                    break
                # No point in sleeping after the last attempt: nothing re-checks.
                if attempt < max_retries:
                    with Then("Not ready. Wait for " + str(attempt * 5) +
                              " seconds"):
                        time.sleep(attempt * 5)
            assert out == expect_result, error()

    with Given("clickhouse-operator is installed"):
        kubectl.kube_wait_field("pods", "-l app=clickhouse-operator",
                                ".status.containerStatuses[*].ready",
                                "true,true",
                                ns=settings.operator_namespace)
        assert kubectl.kube_get_count(
            "pod", ns='--all-namespaces',
            label="-l app=clickhouse-operator") > 0, error()

        # First line of `kubectl get pods` is the header; the pod name is the
        # first column of the second line.
        out = kubectl.kubectl("get pods -l app=clickhouse-operator",
                              ns=settings.operator_namespace).splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = settings.operator_namespace
        kubectl.kube_deletens(kubectl.namespace)
        kubectl.kube_createns(kubectl.namespace)
        check_monitoring_chi(operator_namespace, operator_pod, [])

        with And("created simple clickhouse installation"):
            config = kubectl.get_full_path(
                "../docs/chi-examples/01-simple-layout-01-1shard-1repl.yaml")
            kubectl.create_and_check(config, {
                "object_counts": [1, 1, 2],
                "do_not_delete": True
            })
            expected_chi = [{
                "namespace": "test",
                "name": "simple-01",
                "hostnames":
                ["chi-simple-01-cluster-0-0.test.svc.cluster.local"]
            }]
            check_monitoring_chi(operator_namespace, operator_pod,
                                 expected_chi)

            with When("reboot metrics exporter"):
                kubectl.kubectl(
                    f"exec -n {operator_namespace} {operator_pod} -c metrics-exporter reboot"
                )
                time.sleep(15)
                kubectl.kube_wait_field("pods", "-l app=clickhouse-operator",
                                        ".status.containerStatuses[*].ready",
                                        "true,true",
                                        ns=settings.operator_namespace)

                with Then("check metrics exporter still contains chi objects"):
                    check_monitoring_chi(operator_namespace, operator_pod,
                                         expected_chi)
                    kubectl.kube_delete(config)
                    check_monitoring_chi(operator_namespace, operator_pod, [])
def test_distributed_files_to_insert(self):
    """Check that ClickHouseDistributedFilesToInsertHigh fires and then clears.

    Distributed sends are stopped on the "delayed" pod, then rows are inserted
    repeatedly (while the other pod's main process is being killed) until more
    than 55 pending files are seen either in ``system.metrics`` or on disk.
    After the alert fires, sends are re-enabled and the alert is expected to
    go away. The distributed table is dropped at the end.
    """
    delayed_pod, delayed_svc, restarted_pod, _restarted_svc = alerts.random_pod_choice_for_callbacks(
        chi)
    clickhouse.create_distributed_table_on_cluster(chi)

    chi_name = chi["metadata"]["name"]
    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1000'
    clickhouse.query(chi_name,
                     'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
                     pod=delayed_pod,
                     ns=kubectl.namespace)

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    max_tries = 500
    tries = 0
    # we need more than 50 delayed files for catch
    while files_to_insert_from_disk <= 55 and files_to_insert_from_metrics <= 55 and tries < max_tries:
        # Count the attempt so the loop is actually bounded by max_tries.
        tries += 1
        kubectl.launch(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse-pod -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.query(chi_name,
                         insert_sql,
                         pod=delayed_pod,
                         host=delayed_pod,
                         ns=kubectl.namespace)

        files_to_insert_from_metrics = int(
            clickhouse.query(
                chi_name,
                "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
                pod=delayed_pod,
                ns=kubectl.namespace))

        files_to_insert_from_disk = int(
            kubectl.launch(
                f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse-pod -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
                ok_to_fail=False,
            ))

    with When("reboot clickhouse-server pod"):
        fired = alerts.wait_alert_state(
            "ClickHouseDistributedFilesToInsertHigh",
            "firing",
            True,
            labels={
                "hostname": delayed_svc,
                "chi": chi_name
            })
        assert fired, error(
            "can't get ClickHouseDistributedFilesToInsertHigh alert in firing state"
        )

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

    clickhouse.query(chi_name,
                     'SYSTEM START DISTRIBUTED SENDS default.test_distr',
                     pod=delayed_pod,
                     ns=kubectl.namespace)

    with Then("check ClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseDistributedFilesToInsertHigh",
            "firing",
            False,
            labels={"hostname": delayed_svc})
        assert resolved, error(
            "can't check ClickHouseDistributedFilesToInsertHigh alert is gone away"
        )

    clickhouse.drop_distributed_table_on_cluster(chi)
def check_datatype(connection,
                   datatype,
                   values,
                   nullable=False,
                   quote=False,
                   repr=str,
                   encoding="utf-8",
                   expected=None):
    """Check support for a data type.

    Creates table ``ps`` with a single column ``v`` of ``datatype``, inserts
    every value, then selects all rows and each value individually, comparing
    the rendered result against ``expected`` where an expectation is given.

    :param connection: object exposing ``query(...)``, ``connection.setencoding``
        and ``encoding``
    :param datatype: column type; wrapped in ``Nullable(...)`` when ``nullable``
    :param values: values to insert; the caller's list is not modified
    :param nullable: also test ``NULL`` and extend the expectations for it
    :param quote: wrap each rendered value in single quotes in the INSERT
    :param repr: callable used to render values and result rows as text
    :param encoding: encoding set on the connection for the duration of the check
    :param expected: optional mapping of ``"all"`` and individual values to the
        expected rendered result; the caller's dict is not modified
    """
    # Work on copies so the nullable adjustments below never leak into the
    # caller's list or dict (which may be reused across several checks).
    values = list(values)
    expected = dict(expected) if expected else {}

    if nullable:
        datatype = f"Nullable({datatype})"
        values.append(NULL)

        if expected:
            expected["all"] = expected['all'].rsplit("]", 1)[0] + ", (None, )]"
            expected[NULL] = "[(None, )]"

    with Given("PyODBC connection"):
        with Given("parameters", description=f""" values {values} expected data {expected} """):
            with Given(f"table with a column of data type {datatype}"):
                connection.query("DROP TABLE IF EXISTS ps", fetch=False)
                connection.query(
                    f"CREATE TABLE ps (v {datatype}) ENGINE = Memory",
                    fetch=False)
                try:
                    connection.connection.setencoding(encoding=encoding)
                    for v in values:
                        with When(f"I insert value {repr(v)}", flags=TE):
                            # Values are rendered inline rather than bound as
                            # "?" parameters.
                            if quote:
                                connection.query(
                                    f"INSERT INTO ps VALUES ('{repr(v)}')",
                                    fetch=False)
                            else:
                                connection.query(
                                    f"INSERT INTO ps VALUES ({repr(v)})",
                                    fetch=False)

                    with When("I select all values", flags=TE):
                        rows = connection.query("SELECT * FROM ps ORDER BY v")
                        if expected.get("all") is not None:
                            with Then(f"the result is {expected.get('all')}",
                                      flags=TE):
                                assert repr(rows) == expected.get(
                                    "all"), error("result did not match")

                    with When(f"I have values {repr(values)}"):
                        for v in values:
                            if v is NULL:
                                # comparing to NULL is not valid in SQL
                                continue
                            with When(f"I select value {repr(v)}", flags=TE):
                                rows = connection.query(
                                    "SELECT * FROM ps WHERE v = ? ORDER BY v",
                                    [v])
                                if expected.get(v) is not None:
                                    with Then(
                                            f"the result is {repr(expected.get(v))}",
                                            flags=TE):
                                        assert repr(rows) == expected.get(
                                            v), error("result did not match")
                finally:
                    # Restore the connection's own encoding and clean up the table.
                    connection.connection.setencoding(
                        encoding=connection.encoding)
                    connection.query("DROP TABLE ps", fetch=False)
def wait_backup_pod_ready_and_curl_installed(backup_pod):
    """Wait until the pod's second container is ready, then run ``curl --version`` in clickhouse-backup."""
    curl_probe = f'exec {backup_pod} -c clickhouse-backup -- curl --version'
    with Then(f"wait {backup_pod} ready"):
        kubectl.wait_field(
            "pod",
            backup_pod,
            ".status.containerStatuses[1].ready",
            "true",
        )
        kubectl.launch(curl_probe)
def kube_check_pod_image(chi_name, image, ns="test"):
    """Assert that the pod image reported for ``chi_name`` equals ``image``."""
    actual_image = kube_get_pod_image(chi_name, ns)
    step_title = f"Expect pod image {actual_image} to match {image}"
    with Then(step_title):
        assert actual_image == image
def kube_check_service(service_name, service_type, ns="test"):
    """Fetch the named service and assert its ``spec.type`` equals ``service_type``."""
    with When(f"{service_name} is available"):
        svc = kube_get("service", service_name, ns=ns)
        with Then(f"Service type is {service_type}"):
            actual_type = svc["spec"]["type"]
            assert actual_type == service_type
def kube_check_pod_ports(chi_name, ports, ns="test"):
    """Assert that the pod ports of ``chi_name`` match ``ports`` regardless of order.

    :param chi_name: name passed to ``kube_get_pod_ports``
    :param ports: expected ports; not modified by this check
    :param ns: namespace to look in
    """
    pod_ports = kube_get_pod_ports(chi_name, ns)
    with Then(f"Expect pod ports {pod_ports} to match {ports}"):
        # list.sort() sorts in place and returns None, so comparing the results
        # of two .sort() calls is always True. Compare sorted copies instead.
        assert sorted(pod_ports) == sorted(ports)
def kube_delete(config, ns="test"):
    """Run ``kubectl delete -f config`` in namespace ``ns`` and require exit code 0."""
    delete_command = f"{kubectlcmd} delete -n {ns} -f {config}"
    with When(f"{config} is deleted"):
        result = shell(delete_command)
    with Then("exitcode should be 0"):
        assert result.exitcode == 0, error()
def kube_apply(config, ns="test"):
    """Run ``kubectl apply -f config`` in namespace ``ns`` and require exit code 0."""
    apply_command = f"{kubectlcmd} apply -n {ns} -f {config}"
    with When(f"{config} is applied"):
        result = shell(apply_command)
    with Then("exitcode should be 0"):
        assert result.exitcode == 0, error()
def test_008():
    """Run the operator-restart scenario against the simple and the advanced manifest."""
    restart_cases = (
        ("Test simple chi for operator restart",
         "configs/test-008-operator-restart-1.yaml"),
        ("Test advanced chi for operator restart",
         "configs/test-008-operator-restart-2.yaml"),
    )
    for step_title, manifest_path in restart_cases:
        with Then(step_title):
            test_operator_restart(manifest_path)
def test_metrics_exporter_with_multiple_clickhouse_version():
    """Check the exporter's ``/metrics`` output for an installation with several ClickHouse versions.

    The scenario expects no ``chi_clickhouse_metric_VersionInteger`` metric on
    an empty namespace, expects one per host after installing the
    multi-version manifest, and expects none again after the namespace is
    deleted.
    """

    def check_monitoring_metrics(operator_namespace,
                                 operator_pod,
                                 expect_result,
                                 max_retries=10):
        """Poll ``/metrics`` until every substring in ``expect_result`` is present/absent as required.

        ``expect_result`` maps a substring to ``True`` (must appear) or
        ``False`` (must not appear). Makes up to ``max_retries`` attempts,
        sleeping ``attempt * 5`` seconds between them; the endpoint is always
        re-read after the last sleep.
        """
        with And(
                f"metrics-exporter /metrics enpoint result should match with {expect_result}"
        ):
            all_strings_expected_done = False
            for attempt in range(1, max_retries + 1):
                out = kubectl.kubectl(
                    f"exec {operator_pod} -c metrics-exporter wget -- -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace)
                all_strings_expected_done = all(
                    exists == (string in out)
                    for string, exists in expect_result.items())
                if all_strings_expected_done:
                    break
                # No point in sleeping after the last attempt: nothing re-checks.
                if attempt < max_retries:
                    with Then("Not ready. Wait for " + str(attempt * 5) +
                              " seconds"):
                        time.sleep(attempt * 5)
            assert all_strings_expected_done, error()

    with Given("clickhouse-operator pod exists"):
        # First line of `kubectl get pods` is the header; the pod name is the
        # first column of the second line.
        out = kubectl.kubectl("get pods -l app=clickhouse-operator",
                              ns='kube-system').splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = "kube-system"

        with Then("check empty /metrics"):
            kubectl.kube_deletens(kubectl.namespace)
            kubectl.kube_createns(kubectl.namespace)
            check_monitoring_metrics(
                operator_namespace,
                operator_pod,
                expect_result={
                    'chi_clickhouse_metric_VersionInteger': False,
                })

        with Then("Install multiple clickhouse version"):
            config = kubectl.get_full_path(
                "configs/test-017-multi-version.yaml")
            kubectl.create_and_check(config, {
                "object_counts": [4, 4, 5],
                "do_not_delete": True
            })
            with And("Check not empty /metrics"):
                check_monitoring_metrics(
                    operator_namespace,
                    operator_pod,
                    expect_result={
                        '# HELP chi_clickhouse_metric_VersionInteger': True,
                        '# TYPE chi_clickhouse_metric_VersionInteger gauge': True,
                        'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-0-0': True,
                        'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-1-0': True,
                        'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-2-0': True,
                        'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-3-0': True,
                    })

        with Then("check empty /metrics after delete namespace"):
            kubectl.kube_deletens(kubectl.namespace)
            check_monitoring_metrics(
                operator_namespace,
                operator_pod,
                expect_result={
                    'chi_clickhouse_metric_VersionInteger': False,
                })
def test_009(version_from="0.11.0", version_to=settings.operator_version):
    """Run the operator-upgrade scenario against the simple and the advanced manifest."""
    upgrade_cases = (
        ("Test simple chi for operator upgrade",
         "configs/test-009-operator-upgrade-1.yaml"),
        ("Test advanced chi for operator upgrade",
         "configs/test-009-operator-upgrade-2.yaml"),
    )
    for step_title, manifest_path in upgrade_cases:
        with Then(step_title):
            test_operator_upgrade(manifest_path, version_from, version_to)
def test_backup_duration(self):
    """Check the ClickHouseBackupTooLong and ClickHouseBackupTooShort alerts.

    A fake backup metric is applied and ClickHouseBackupTooLong is expected
    to fire for both chosen pods. The metrics file served by one pod is then
    rewritten with a different duration and ClickHouseBackupTooShort is
    expected to fire for it. After the normal backup setup is restored, both
    alerts are expected to clear.
    """
    short_pod, _, long_pod, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup duration metric")

    for pod in (short_pod, long_pod):
        with Then(f"wait {pod} ready"):
            kubectl.wait_field("pod", pod, ".spec.containers[1].image",
                               "nginx:latest")
            kubectl.wait_field("pod", pod,
                               ".status.containerStatuses[1].ready", "true")

            too_long_fired = alerts.wait_alert_state(
                "ClickHouseBackupTooLong",
                "firing",
                expected_state=True,
                sleep_time=settings.prometheus_scrape_interval,
                labels={"pod_name": pod},
                time_range='60s',
            )
            assert too_long_fired, error(
                f"can't get ClickHouseBackupTooLong alert in firing state for {pod}"
            )

    with Then("wait when prometheus will scrape fake data"):
        time.sleep(70)

    with Then(f"decrease {short_pod} backup duration"):
        # Shell commands that rewrite the metrics file served by nginx:
        # the first one truncates the file, the rest append to it.
        metrics_file = "/usr/share/nginx/html/metrics"
        rewrite_commands = [
            f'echo "# HELP clickhouse_backup_last_create_duration Backup create duration in nanoseconds" > {metrics_file}',
            f'echo "# TYPE clickhouse_backup_last_create_duration gauge" >> {metrics_file}',
            f'echo "clickhouse_backup_last_create_duration 7000000000000" >> {metrics_file}',
            f'echo "# HELP clickhouse_backup_last_create_status Last backup create status: 0=failed, 1=success, 2=unknown" >> {metrics_file}',
            f'echo "# TYPE clickhouse_backup_last_create_status gauge" >> {metrics_file}',
            f'echo "clickhouse_backup_last_create_status 1" >> {metrics_file}',
        ]
        script = " && ".join(rewrite_commands)
        kubectl.launch(
            f"exec {short_pod} -c clickhouse-backup -- bash -xc '{script}'")

        too_short_fired = alerts.wait_alert_state(
            "ClickHouseBackupTooShort",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": short_pod},
            time_range='60s',
        )
        assert too_short_fired, error(
            "can't get ClickHouseBackupTooShort alert in firing state")

    apply_normal_backup()

    with Then("check ClickHouseBackupTooShort gone away"):
        too_short_resolved = alerts.wait_alert_state(
            "ClickHouseBackupTooShort",
            "firing",
            expected_state=False,
            labels={"pod_name": short_pod},
        )
        assert too_short_resolved, error(
            "can't get ClickHouseBackupTooShort alert is gone away")

    with Then("check ClickHouseBackupTooLong gone away"):
        too_long_resolved = alerts.wait_alert_state(
            "ClickHouseBackupTooLong",
            "firing",
            expected_state=False,
            labels={"pod_name": long_pod},
        )
        assert too_long_resolved, error(
            "can't get ClickHouseBackupTooLong alert is gone away")