def test_detached_parts(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    clickhouse.create_table_on_cluster(chi)
    detached_pod, detached_svc, _, _ = alerts.random_pod_choice_for_callbacks(chi)

    def create_part_and_detach():
        clickhouse.query(chi["metadata"]["name"], "INSERT INTO default.test SELECT now(), number FROM numbers(100)", pod=detached_pod)
        part_name = clickhouse.query(
            chi["metadata"]["name"],
            sql="SELECT name FROM system.parts WHERE database='default' AND table='test' ORDER BY modification_time DESC LIMIT 1",
            pod=detached_pod
        )
        clickhouse.query(chi["metadata"]["name"], f"ALTER TABLE default.test DETACH PART '{part_name}'", pod=detached_pod)

    def attach_all_parts():
        detached_parts = clickhouse.query(chi["metadata"]["name"], "SELECT name FROM system.detached_parts WHERE database='default' AND table='test' AND reason=''", pod=detached_pod)
        all_parts = ""
        for part in detached_parts.splitlines():
            all_parts += f"ALTER TABLE default.test ATTACH PART '{part}';"
        if all_parts.strip() != "":
            clickhouse.query(chi["metadata"]["name"], all_parts, pod=detached_pod)

    with When("check ClickHouseDetachedParts firing"):
        fired = alerts.wait_alert_state("ClickHouseDetachedParts", "firing", True,
                                 labels={"hostname": detached_svc, "chi": chi["metadata"]["name"]}, time_range='30s',
                                 callback=create_part_and_detach)
        assert fired, error("can't get ClickHouseDetachedParts alert in firing state")

    with Then("check ClickHouseDetachedParts gone away"):
        resolved = alerts.wait_alert_state("ClickHouseDetachedParts", "firing", False,
                                    labels={"hostname": detached_svc}, callback=attach_all_parts)
        assert resolved, error("can't check ClickHouseDetachedParts alert is gone away")

    clickhouse.drop_table_on_cluster(chi)
def test_read_only_replica(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    read_only_pod, read_only_svc, other_pod, other_svc = alerts.random_pod_choice_for_callbacks(chi)
    chi_name = chi["metadata"]["name"]
    clickhouse.create_table_on_cluster(
        chi,
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )

    def restart_zookeeper():
        kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
            ok_to_fail=True,
        )
        clickhouse.query_with_error(chi_name, "INSERT INTO default.test_repl VALUES(now(),rand())", host=read_only_svc)

    with Then("check ClickHouseReadonlyReplica firing"):
        fired = alerts.wait_alert_state("ClickHouseReadonlyReplica", "firing", True, labels={"hostname": read_only_svc},
                                        time_range='30s', sleep_time=settings.prometheus_scrape_interval, callback=restart_zookeeper)
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")
    with Then("check ClickHouseReadonlyReplica gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReadonlyReplica", "firing", False, labels={"hostname": read_only_svc})
        assert resolved, error("can't check ClickHouseReadonlyReplica alert is gone away")

    kubectl.wait_pod_status("zookeeper-0", "Running", ns=kubectl.namespace)
    kubectl.wait_jsonpath("pod", "zookeeper-0", "{.status.containerStatuses[0].ready}", "true",
                          ns=kubectl.namespace)

    for i in range(11):
        zookeeper_status = kubectl.launch(
            f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"echo ruok | nc 127.0.0.1 2181\"", ok_to_fail=True
        )
        if "imok" in zookeeper_status:
            break
        elif i == 10:
            fail(f"invalid zookeeper status after {i} retries")
        with Then("zookeper is not ready, wait 2 seconds"):
            time.sleep(2)

    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=read_only_svc, timeout=240
    )
    clickhouse.query_with_error(
        chi_name, "SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=other_svc, timeout=240
    )

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')
Example #3
0
 def insert_replicated_data(chi, create_tables, insert_tables):
     with When(f'create if not exists replicated tables {create_tables}'):
         for table in create_tables:
             clickhouse.create_table_on_cluster(
                 chi,
                 'all-sharded',
                 f'default.{table}',
                 f'(id UInt64) ENGINE=ReplicatedMergeTree(\'/clickhouse/tables/default.{table}/{{shard}}\',\'{{replica}}\') ORDER BY (id)',
                 if_not_exists=True,
             )
     with When(f'insert tables data {insert_tables}'):
         for table in insert_tables:
             clickhouse.query(
                 chi['metadata']['name'],
                 f'INSERT INTO default.{table} SELECT rand()+number FROM numbers(1000)',
                 pod="chi-test-cluster-for-zk-default-0-1-0")
def test_replicas_max_absolute_delay(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    stop_replica_pod, stop_replica_svc, insert_pod, insert_svc = alerts.random_pod_choice_for_callbacks(chi)
    clickhouse.create_table_on_cluster(
        chi,
        'all-replicated', 'default.test_repl',
        '(event_time DateTime, test UInt64) ' +
        'ENGINE ReplicatedMergeTree(\'/clickhouse/tables/{installation}-{shard}/test_repl\', \'{replica}\') ORDER BY tuple()'
    )
    prometheus_scrape_interval = 15

    def restart_clickhouse_and_insert_to_replicated_table():
        with When(f"stop replica fetches on {stop_replica_svc}"):
            sql = "SYSTEM STOP FETCHES default.test_repl"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse-pod -- clickhouse-client -q \"{sql}\"",
                ok_to_fail=True, timeout=600,
            )
            sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
            kubectl.launch(
                f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse-pod -- clickhouse-client -q \"{sql}\"",
            )

    with Then("check ClickHouseReplicasMaxAbsoluteDelay firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseReplicasMaxAbsoluteDelay", "firing", True, labels={"hostname": stop_replica_svc},
            time_range='60s', sleep_time=prometheus_scrape_interval * 2,
            callback=restart_clickhouse_and_insert_to_replicated_table
        )
        assert fired, error("can't get ClickHouseReadonlyReplica alert in firing state")

    clickhouse.query(
        chi["metadata"]["name"],
        "SYSTEM START FETCHES; SYSTEM RESTART REPLICAS; SYSTEM SYNC REPLICA default.test_repl",
        host=stop_replica_svc, timeout=240
    )
    with Then("check ClickHouseReplicasMaxAbsoluteDelay gone away"):
        resolved = alerts.wait_alert_state("ClickHouseReplicasMaxAbsoluteDelay", "firing", False, labels={"hostname": stop_replica_svc})
        assert resolved, error("can't check ClickHouseReplicasMaxAbsoluteDelay alert is gone away")

    clickhouse.drop_table_on_cluster(chi, 'all-replicated', 'default.test_repl')
def test_insert_related_alerts(self, prometheus_operator_spec, clickhouse_operator_spec, chi):
    clickhouse.create_table_on_cluster(chi)
    delayed_pod, delayed_svc, rejected_pod, rejected_svc = alerts.random_pod_choice_for_callbacks(chi)

    prometheus_scrape_interval = settings.prometheus_scrape_interval
    # default values in system.merge_tree_settings
    parts_to_throw_insert = 300
    parts_to_delay_insert = 150
    chi_name = chi["metadata"]["name"]

    parts_limits = parts_to_delay_insert
    selected_svc = delayed_svc

    def insert_many_parts_to_clickhouse():
        stop_merges = "SYSTEM STOP MERGES default.test;"
        min_block = "SET max_block_size=1; SET max_insert_block_size=1; SET min_insert_block_size_rows=1;"
        with When(f"Insert to MergeTree table {parts_limits} parts"):
            r = parts_limits
            sql = stop_merges + min_block + f"INSERT INTO default.test(event_time, test) SELECT now(),number FROM system.numbers LIMIT {r};"
            clickhouse.query(chi_name, sql, host=selected_svc, ns=kubectl.namespace)

            sql = min_block + "INSERT INTO default.test(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1;"
            clickhouse.query_with_error(chi_name, sql, host=selected_svc, ns=kubectl.namespace)
            with Then(f"wait prometheus_scrape_interval={prometheus_scrape_interval}*2 sec"):
                time.sleep(prometheus_scrape_interval * 2)

            with Then("after 21.8 InsertedRows include system.* rows"):
                for i in range(35):
                    sql = min_block + "INSERT INTO default.test(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1;"
                    clickhouse.query_with_error(chi_name, sql, host=selected_svc, ns=kubectl.namespace)

    insert_many_parts_to_clickhouse()
    with Then("check ClickHouseDelayedInsertThrottling firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseDelayedInsertThrottling", "firing", True, labels={"hostname": delayed_svc}, time_range="60s"
        )
        assert fired, error("can't get ClickHouseDelayedInsertThrottling alert in firing state")
    with Then("check ClickHouseMaxPartCountForPartition firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseMaxPartCountForPartition", "firing", True, labels={"hostname": delayed_svc}, time_range="90s"
        )
        assert fired, error("can't get ClickHouseMaxPartCountForPartition alert in firing state")
    with Then("check ClickHouseLowInsertedRowsPerQuery firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseLowInsertedRowsPerQuery", "firing", True, labels={"hostname": delayed_svc}, time_range="120s",
        )
        assert fired, error("can't get ClickHouseLowInsertedRowsPerQuery alert in firing state")

    clickhouse.query(chi_name, "SYSTEM START MERGES default.test", host=selected_svc, ns=kubectl.namespace)

    with Then("check ClickHouseDelayedInsertThrottling gone away"):
        resolved = alerts.wait_alert_state("ClickHouseDelayedInsertThrottling", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseDelayedInsertThrottling alert is gone away")
    with Then("check ClickHouseMaxPartCountForPartition gone away"):
        resolved = alerts.wait_alert_state("ClickHouseMaxPartCountForPartition", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseMaxPartCountForPartition alert is gone away")
    with Then("check ClickHouseLowInsertedRowsPerQuery gone away"):
        resolved = alerts.wait_alert_state("ClickHouseLowInsertedRowsPerQuery", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check ClickHouseLowInsertedRowsPerQuery alert is gone away")

    parts_limits = parts_to_throw_insert
    selected_svc = rejected_svc
    insert_many_parts_to_clickhouse()
    with Then("check ClickHouseRejectedInsert firing"):
        fired = alerts.wait_alert_state("ClickHouseRejectedInsert", "firing", True, labels={"hostname": rejected_svc}, time_range="30s",
                                        sleep_time=settings.prometheus_scrape_interval)
        assert fired, error("can't get ClickHouseRejectedInsert alert in firing state")

    with Then("check ClickHouseRejectedInsert gone away"):
        resolved = alerts.wait_alert_state("ClickHouseRejectedInsert", "firing", False, labels={"hostname": rejected_svc})
        assert resolved, error("can't check ClickHouseRejectedInsert alert is gone away")

    clickhouse.query(chi_name, "SYSTEM START MERGES default.test", host=selected_svc, ns=kubectl.namespace)
    clickhouse.drop_table_on_cluster(chi)