def reboot_metrics_exporter():
    clickhouse_operator_pod = clickhouse_operator_spec["items"][0][
        "metadata"]["name"]
    kubectl.launch(
        f"exec -n {settings.operator_namespace} {clickhouse_operator_pod} -c metrics-exporter -- reboot",
        ok_to_fail=True,
    )
def restart_zookeeper():
    kubectl.launch(
        f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
        ok_to_fail=True,
    )
    clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc1)
    clickhouse.query_with_error(chi_name, "SELECT name, path FROM system.zookeeper WHERE path='/'", host=svc2)
    def make_too_many_connection():
        long_cmd = ""
        for _ in range(120):
            port = random.choice(["8123", "3306", "9000"])
            if port == "8123":
                # the HTTPConnection metric only increases after the HTTP request is fully parsed, so we can't insert a pause between CONNECT and running the query
                # long_cmd += f"nc -vv 127.0.0.1 {port} <( printf \"POST / HTTP/1.1\\r\\nHost: 127.0.0.1:8123\\r\\nContent-Length: 34\\r\\n\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\");"
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            elif port == "9000":
                long_cmd += 'clickhouse-client --idle_connection_timeout 70 --receive_timeout 70 -q "SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            # elif port == "3306":
            #     long_cmd += 'mysql -u default -h 127.0.0.1 -e "SELECT sleepEachRow(1),number, now() FROM numbers(30)";'
            else:
                long_cmd += f"printf \"1\\n1\" | nc -q 5 -i 30 -vv 127.0.0.1 {port};"

        nc_cmd = f"echo '{long_cmd} exit 0' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 120 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/nc_cmd.sh", "w") as f:
            f.write(nc_cmd)

        kubectl.launch(
            f"cp /tmp/nc_cmd.sh {too_many_connection_pod}:/tmp/nc_cmd.sh -c clickhouse"
        )

        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse -- bash /tmp/nc_cmd.sh",
            timeout=600,
        )
def test_metrics_exporter_reboot():
    def check_monitoring_chi(operator_namespace, operator_pod, expect_result, max_retries=10):
        with And(f"metrics-exporter /chi endpoint result should return {expect_result}"):
            for i in range(1, max_retries):
                # call /metrics to trigger a refresh of the monitored instances
                kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                # check /chi after the monitored instances are refreshed
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/chi",
                    ns=operator_namespace
                )
                out = json.loads(out)
                if out == expect_result:
                    break
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
            assert out == expect_result, error()

    with Given("clickhouse-operator is installed"):
        kubectl.wait_field("pods", "-l app=clickhouse-operator", ".status.containerStatuses[*].ready", "true,true",
                           ns=settings.operator_namespace)
        assert kubectl.get_count("pod", ns='--all-namespaces', label="-l app=clickhouse-operator") > 0, error()

        out = kubectl.launch("get pods -l app=clickhouse-operator", ns=settings.operator_namespace).splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = settings.operator_namespace
        kubectl.delete_ns(kubectl.namespace)
        kubectl.create_ns(kubectl.namespace)
        check_monitoring_chi(operator_namespace, operator_pod, [])
        with And("created simple clickhouse installation"):
            config = util.get_full_path("../docs/chi-examples/01-simple-layout-01-1shard-1repl.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 1,
                        "pod": 1,
                        "service": 2,
                    },
                    "do_not_delete": True,
                })
            expected_chi = [{
                "namespace": "test", "name": "simple-01",
                "hostnames": ["chi-simple-01-cluster-0-0.test.svc.cluster.local"]
            }]
            check_monitoring_chi(operator_namespace, operator_pod, expected_chi)
            with When("reboot metrics exporter"):
                kubectl.launch(f"exec -n {operator_namespace} {operator_pod} -c metrics-exporter -- reboot")
                time.sleep(15)
                kubectl.wait_field("pods", "-l app=clickhouse-operator",
                                   ".status.containerStatuses[*].ready", "true,true",
                                   ns=settings.operator_namespace)
                with Then("check metrics exporter still contains chi objects"):
                    check_monitoring_chi(operator_namespace, operator_pod, expected_chi)
                    kubectl.delete(config)
                    check_monitoring_chi(operator_namespace, operator_pod, [])
Example #5
def create_success_backup():
    backup_name = backup_prefix + "-" + str(random.randint(1, 4096))
    kubectl.launch(
        f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- curl -X POST -sL http://127.0.0.1:7171/backup/create?name={backup_name}",
    )
    wait_backup_command_status(backup_pod,
                               command_name=f'create {backup_name}',
                               expected_status='success')
def restart_zookeeper():
    kubectl.launch(
        f"exec -n {kubectl.namespace} zookeeper-0 -- sh -c \"kill 1\"",
        ok_to_fail=True,
    )
    clickhouse.query_with_error(
        chi_name,
        "INSERT INTO default.test_repl VALUES(now(),rand())",
        host=read_only_svc)
Example #7
def test_backup_not_run(self):
    not_run_pod, _, _, _ = alerts.random_pod_choice_for_callbacks(chi)
    apply_fake_backup("prepare fake backup for time metric")

    with Then(f"wait {not_run_pod} ready"):
        kubectl.wait_field("pod", not_run_pod, ".spec.containers[1].image",
                           "nginx:latest")
        kubectl.wait_field("pod", not_run_pod,
                           ".status.containerStatuses[1].ready", "true")

    with Then(f"setup {not_run_pod} backup create end time"):
        kubectl.launch(
            f'exec {not_run_pod} -c clickhouse-backup -- bash -xc \''
            'echo "# HELP clickhouse_backup_last_create_finish Last backup create finish timestamp" > /usr/share/nginx/html/metrics && '
            'echo "# TYPE clickhouse_backup_last_create_finish gauge" >> /usr/share/nginx/html/metrics && '
            f'echo "clickhouse_backup_last_create_finish {int((datetime.datetime.now() - datetime.timedelta(days=2)).timestamp())}" >> /usr/share/nginx/html/metrics '
            '\'')

        fired = alerts.wait_alert_state(
            "ClickhouseBackupDoesntRunTooLong",
            "firing",
            expected_state=True,
            sleep_time=settings.prometheus_scrape_interval,
            labels={"pod_name": not_run_pod},
            time_range='60s')
        assert fired, error(
            "can't get ClickhouseBackupDoesntRunTooLong alert in firing state")

    apply_normal_backup()

    backup_name = prepare_table_for_backup(not_run_pod)
    wait_backup_pod_ready_and_curl_installed(not_run_pod)

    with When('Backup is successful'):
        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name={backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'create {backup_name}',
                                   expected_status='success')

        exec_on_backup_container(
            not_run_pod,
            f'curl -X POST -sL "http://127.0.0.1:7171/backup/upload/{backup_name}"'
        )
        wait_backup_command_status(not_run_pod,
                                   f'upload {backup_name}',
                                   expected_status='success')

    with Then("check ClickhouseBackupDoesntRunTooLong gone away"):
        resolved = alerts.wait_alert_state("ClickhouseBackupDoesntRunTooLong",
                                           "firing",
                                           expected_state=False,
                                           labels={"pod_name": not_run_pod})
        assert resolved, error(
            "can't check that ClickhouseBackupDoesntRunTooLong alert has gone away")
def test_too_many_connections(self):
    too_many_connection_pod, too_many_connection_svc, _, _ = alerts.random_pod_choice_for_callbacks(
        chi)
    cmd = "export DEBIAN_FRONTEND=noninteractive; apt-get update; apt-get install -y netcat mysql-client"
    kubectl.launch(
        f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse-pod -- bash -c  \"{cmd}\"",
        timeout=120,
    )

    def make_too_many_connection():
        long_cmd = ""
        for _ in range(120):
            port = random.choice(["8123", "3306", "3306", "3306", "9000"])
            if port == "8123":
                # the HTTPConnection metric only increases after the HTTP request is fully parsed, so we can't insert a pause between CONNECT and running the query
                # long_cmd += f"nc -vv 127.0.0.1 {port} <( printf \"POST / HTTP/1.1\\r\\nHost: 127.0.0.1:8123\\r\\nContent-Length: 34\\r\\n\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\\r\\nTEST\");"
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            elif port == "9000":
                long_cmd += 'clickhouse-client --send_logs_level information --idle_connection_timeout 70 --receive_timeout 70 -q "SELECT sleepEachRow(1),number,now() FROM numbers(30)";'
            # elif port == "3306":
            #     long_cmd += 'mysql -u default -h 127.0.0.1 -e "SELECT sleepEachRow(1),number, now() FROM numbers(30)";'
            else:
                long_cmd += f"printf \"1\\n1\" | nc -q 5 -i 30 -vv 127.0.0.1 {port};"

        nc_cmd = f"echo '{long_cmd} whereis nc; exit 0' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 120 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/nc_cmd.sh", "w") as f:
            f.write(nc_cmd)

        kubectl.launch(
            f"cp /tmp/nc_cmd.sh {too_many_connection_pod}:/tmp/nc_cmd.sh -c clickhouse-pod"
        )

        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_connection_pod} -c clickhouse-pod -- bash /tmp/nc_cmd.sh",
            timeout=600,
        )

    with Then("check ClickHouseTooManyConnections firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseTooManyConnections",
            "firing",
            True,
            labels={"hostname": too_many_connection_svc},
            time_range='90s',
            callback=make_too_many_connection)
        assert fired, error(
            "can't get ClickHouseTooManyConnections alert in firing state")

    with Then("check ClickHouseTooManyConnections gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseTooManyConnections",
            "firing",
            False,
            labels={"hostname": too_many_connection_svc})
        assert resolved, error(
            "can't check that ClickHouseTooManyConnections alert has gone away")
def rewrite_dns_on_clickhouse_server(write_new=True):
    dns = new_dns if write_new else old_dns
    kubectl.launch(
        f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- bash -c \"printf \\\"{dns}\\\" > /etc/resolv.conf\"",
        ok_to_fail=False,
    )
    kubectl.launch(
        f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- clickhouse-client --echo -mn -q \"SYSTEM DROP DNS CACHE; SELECT count() FROM cluster('all-sharded',system,metrics)\"",
        ok_to_fail=True,
    )
def run_queries_with_priority():
    sql = ""
    for i in range(50):
        sql += f"SET priority={i % 20};SELECT uniq(number) FROM numbers(20000000):"
    cmd = f"echo \\\"{sql} SELECT 1\\\" | xargs -i'{{}}' --no-run-if-empty -d ':' -P 20 clickhouse-client --time -m -n -q \\\"{{}}\\\""
    kubectl.launch(f"exec {priority_pod} -- bash -c \"{cmd}\"", timeout=120)
    clickhouse.query(
        chi["metadata"]["name"],
        "SELECT event_time, CurrentMetric_QueryPreempted FROM system.metric_log WHERE CurrentMetric_QueryPreempted > 0",
        host=priority_svc,
    )
def restart_clickhouse_and_insert_to_replicated_table():
    with When(f"stop replica fetches on {stop_replica_svc}"):
        sql = "SYSTEM STOP FETCHES default.test_repl"
        kubectl.launch(
            f"exec -n {kubectl.namespace} {stop_replica_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
            ok_to_fail=True,
        )
        sql = "INSERT INTO default.test_repl SELECT now(), number FROM numbers(100000)"
        kubectl.launch(
            f"exec -n {kubectl.namespace} {insert_pod} -c clickhouse -- clickhouse-client -q \"{sql}\"",
        )
def test_too_much_running_queries(self):
    _, _, too_many_queries_pod, too_many_queries_svc = alerts.random_pod_choice_for_callbacks(
        chi)
    cmd = "export DEBIAN_FRONTEND=noninteractive; apt-get update; apt-get install -y mysql-client"
    kubectl.launch(
        f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse-pod -- bash -c  \"{cmd}\"",
        ok_to_fail=True,
    )

    def make_too_many_queries():
        long_cmd = ""
        for _ in range(90):
            port = random.choice(["8123", "3306", "9000"])
            if port == "9000":
                long_cmd += 'clickhouse-client -q "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "3306":
                long_cmd += 'mysql -h 127.0.0.1 -P 3306 -u default -e "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "8123":
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),now() FROM numbers(60)";'

        long_cmd = f"echo '{long_cmd}' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 100 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/long_cmd.sh", "w") as f:
            f.write(long_cmd)

        kubectl.launch(
            f"cp /tmp/long_cmd.sh {too_many_queries_pod}:/tmp/long_cmd.sh -c clickhouse-pod"
        )
        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse-pod -- bash /tmp/long_cmd.sh",
            timeout=90,
        )

    with Then("check ClickHouseTooManyRunningQueries firing"):
        fired = alerts.wait_alert_state(
            "ClickHouseTooManyRunningQueries",
            "firing",
            True,
            labels={"hostname": too_many_queries_svc},
            callback=make_too_many_queries,
            time_range="30s")
        assert fired, error(
            "can't get ClickHouseTooManyRunningQueries alert in firing state")

    with Then("check ClickHouseTooManyRunningQueries gone away"):
        resolved = alerts.wait_alert_state(
            "ClickHouseTooManyRunningQueries",
            "firing",
            False,
            labels={"hostname": too_many_queries_svc},
            sleep_time=settings.prometheus_scrape_interval)
        assert resolved, error(
            "can't check that ClickHouseTooManyRunningQueries alert has gone away")
Example #13
    def create_fail_backup():
        backup_name = backup_prefix + "-" + str(random.randint(1, 4096))
        backup_dir = f"/var/lib/clickhouse/backup/{backup_name}/shadow/default/test_backup"
        kubectl.launch(
            f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- bash -c 'mkdir -v -m 0400 -p {backup_dir}'",
        )

        kubectl.launch(
            f"exec -n {settings.test_namespace} {backup_pod} -c clickhouse-backup -- curl -X POST -sL http://127.0.0.1:7171/backup/create?name={backup_name}",
        )
        wait_backup_command_status(backup_pod,
                                   command_name=f'create {backup_name}',
                                   expected_status='error')
Example #14
def restart_operator(ns=settings.operator_namespace, timeout=60):
    pod_name = kubectl.get(
        "pod", name="", ns=ns,
        label="-l app=clickhouse-operator")["items"][0]["metadata"]["name"]
    kubectl.launch(f"delete pod {pod_name}", ns=ns, timeout=timeout)
    kubectl.wait_object("pod",
                        name="",
                        ns=ns,
                        label="-l app=clickhouse-operator")
    pod_name = kubectl.get(
        "pod", name="", ns=ns,
        label="-l app=clickhouse-operator")["items"][0]["metadata"]["name"]
    kubectl.wait_pod_status(pod_name, "Running", ns=ns)
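
# A minimal usage sketch for restart_operator: it deletes the operator pod, waits for a
# replacement to be scheduled, and then waits until the new pod reports Running. The
# timeout value below is illustrative.
restart_operator(ns=settings.operator_namespace, timeout=120)
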
    def reboot_clickhouse_and_distributed_exection():
        # we need 70 delayed files to catch the alert
        insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 10000'
        select_sql = 'SELECT count() FROM default.test_distr'
        with Then("reboot clickhouse-server pod"):
            kubectl.launch(
                f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
                ok_to_fail=True,
            )
            with And("Insert to distributed table"):
                clickhouse.query(chi["metadata"]["name"], insert_sql, host=delayed_pod, ns=kubectl.namespace)

            with And("Select from distributed table"):
                clickhouse.query_with_error(chi["metadata"]["name"], select_sql, host=delayed_pod,
                                            ns=kubectl.namespace)
def test_distributed_files_to_insert():
    delayed_pod, delayed_svc, restarted_pod, restarted_svc = random_pod_choice_for_callbacks()
    create_distributed_table_on_cluster()

    insert_sql = 'INSERT INTO default.test_distr(event_time, test) SELECT now(), number FROM system.numbers LIMIT 1000'
    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM STOP DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    files_to_insert_from_metrics = 0
    files_to_insert_from_disk = 0
    tries = 0
    # we need more than 50 delayed files to catch the alert
    while files_to_insert_from_disk <= 55 and files_to_insert_from_metrics <= 55 and tries < 500:
        kubectl.launch(
            f"exec -n {kubectl.namespace} {restarted_pod} -c clickhouse -- kill 1",
            ok_to_fail=True,
        )
        clickhouse.query(chi["metadata"]["name"], insert_sql, pod=delayed_pod, host=delayed_pod, ns=kubectl.namespace)
        files_to_insert_from_metrics = clickhouse.query(
            chi["metadata"]["name"], "SELECT value FROM system.metrics WHERE metric='DistributedFilesToInsert'",
            pod=delayed_pod, ns=kubectl.namespace
        )
        files_to_insert_from_metrics = int(files_to_insert_from_metrics)

        files_to_insert_from_disk = int(kubectl.launch(
            f"exec -n {kubectl.namespace} {delayed_pod} -c clickhouse -- bash -c 'ls -la /var/lib/clickhouse/data/default/test_distr/*/*.bin 2>/dev/null | wc -l'",
            ok_to_fail=False,
        ))
        tries += 1

    with When("reboot clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", True,
                                 labels={"hostname": delayed_svc, "chi": chi["metadata"]["name"]})
        assert fired, error("can't get ClickHouseDistributedFilesToInsertHigh alert in firing state")

    kubectl.wait_pod_status(restarted_pod, "Running", ns=kubectl.namespace)

    clickhouse.query(
        chi["metadata"]["name"], 'SYSTEM START DISTRIBUTED SENDS default.test_distr',
        pod=delayed_pod, ns=kubectl.namespace
    )

    with Then("check ClickHouseDistributedFilesToInsertHigh gone away"):
        resolved = wait_alert_state("ClickHouseDistributedFilesToInsertHigh", "firing", False, labels={"hostname": delayed_svc})
        assert resolved, error("can't check that ClickHouseDistributedFilesToInsertHigh alert has gone away")

    drop_distributed_table_on_cluster()
def test_clickhouse_dns_errors():
    random_idx = random.randint(0, 1)
    clickhouse_pod = chi["status"]["pods"][random_idx]
    clickhouse_svc = chi["status"]["fqdns"][random_idx]

    old_dns = kubectl.launch(
        f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- cat /etc/resolv.conf",
        ok_to_fail=False,
    )
    new_dns = re.sub(r'^nameserver (.+)', 'nameserver 1.1.1.1', old_dns, flags=re.MULTILINE)

    def rewrite_dns_on_clickhouse_server(write_new=True):
        dns = new_dns if write_new else old_dns
        kubectl.launch(
            f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- bash -c \"printf \\\"{dns}\\\" > /etc/resolv.conf\"",
            ok_to_fail=False,
        )
        kubectl.launch(
            f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- clickhouse-client --echo -mn -q \"SYSTEM DROP DNS CACHE; SELECT count() FROM cluster('all-sharded',system,metrics)\"",
            ok_to_fail=True,
        )

    with When("rewrite /etc/resolv.conf in clickhouse-server pod"):
        fired = wait_alert_state("ClickHouseDNSErrors", "firing", True, labels={"hostname": clickhouse_svc},
                                 time_range='20s', callback=rewrite_dns_on_clickhouse_server, sleep_time=5)
        assert fired, error("can't get ClickHouseDNSErrors alert in firing state")

    with Then("check ClickHouseDNSErrors gone away"):
        rewrite_dns_on_clickhouse_server(write_new=False)
        resolved = wait_alert_state("ClickHouseDNSErrors", "firing", False, labels={"hostname": clickhouse_svc})
        assert resolved, error("can't check that ClickHouseDNSErrors alert has gone away")
Example #18
def check_alert_state(alert_name,
                      prometheus_pod,
                      alert_state="firing",
                      labels=None,
                      time_range="10s"):
    with Then(
            f"check {alert_name} for state {alert_state} and {labels} labels in {time_range}"
    ):
        cmd = f"exec -n {settings.prometheus_namespace} {prometheus_pod} -c prometheus -- "
        cmd += "wget -qO- 'http://127.0.0.1:9090/api/v1/query?query=ALERTS{"
        if labels is None:
            labels = {}
        if not isinstance(labels, dict):
            fail(f"Invalid labels={labels}")
        labels.update({"alertname": alert_name, "alertstate": alert_state})
        cmd += ",".join(
            [f"{name}=\"{value}\"" for name, value in labels.items()])
        cmd += f"}}[{time_range}]' 2>/dev/null"
        out = kubectl.launch(cmd)
        out = json.loads(out)
        if not ("status" in out and out["status"] == "success"):
            fail("wrong response from prometheus query API")
        if len(out["data"]["result"]) == 0:
            with Then("not present, empty result"):
                return False
        result_labels = out["data"]["result"][0]["metric"].items()
        exists = all(item in result_labels for item in labels.items())
        with Then("got result and contains labels"
                  if exists else "got result, but doesn't contain labels"):
            return exists
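
# A minimal usage sketch for check_alert_state; the Prometheus pod name and the
# "hostname" label value below are hypothetical placeholders, not values taken
# from the tests on this page.
firing = check_alert_state(
    "ClickHouseTooManyConnections",
    "prometheus-prometheus-0",
    alert_state="firing",
    labels={"hostname": "chi-simple-01-cluster-0-0"},
    time_range="30s",
)
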
def test_metrics_exporter_with_multiple_clickhouse_version():
    def check_monitoring_metrics(operator_namespace, operator_pod, expect_result, max_retries=10):
        with And(f"metrics-exporter /metrics endpoint result should match {expect_result}"):
            for i in range(1, max_retries):
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                all_strings_expected_done = True
                for string, exists in expect_result.items():
                    all_strings_expected_done = (exists == (string in out))
                    if not all_strings_expected_done:
                        break

                if all_strings_expected_done:
                    break
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
            assert all_strings_expected_done, error()

    with Given("clickhouse-operator pod exists"):
        out = kubectl.launch("get pods -l app=clickhouse-operator", ns='kube-system').splitlines()[1]
        operator_pod = re.split(r'[\t\r\n\s]+', out)[0]
        operator_namespace = "kube-system"

        with Then("check empty /metrics"):
            kubectl.delete_ns(kubectl.namespace, ok_to_fail=True)
            kubectl.create_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })

        with Then("Install multiple clickhouse version"):
            config = util.get_full_path("configs/test-017-multi-version.yaml")
            kubectl.create_and_check(
                config=config,
                check={
                    "object_counts": {
                        "statefulset": 4,
                        "pod": 4,
                        "service": 5,
                    },
                    "do_not_delete": True,
                })
            with And("Check not empty /metrics"):
                check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                    '# HELP chi_clickhouse_metric_VersionInteger': True,
                    '# TYPE chi_clickhouse_metric_VersionInteger gauge': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-0-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-1-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-2-0': True,
                    'chi_clickhouse_metric_VersionInteger{chi="test-017-multi-version",hostname="chi-test-017-multi-version-default-3-0': True,

                })

        with Then("check empty /metrics after delete namespace"):
            kubectl.delete_ns(kubectl.namespace)
            check_monitoring_metrics(operator_namespace, operator_pod, expect_result={
                'chi_clickhouse_metric_VersionInteger': False,
            })
Example #20
def exec_on_backup_container(backup_pod,
                             cmd,
                             ns=settings.test_namespace,
                             ok_to_fail=False,
                             timeout=60,
                             container='clickhouse-backup'):
    return kubectl.launch(f'exec -n {ns} {backup_pod} -c {container} -- {cmd}',
                          ok_to_fail=ok_to_fail,
                          timeout=timeout)
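
# A minimal usage sketch for exec_on_backup_container; the pod and backup names are
# hypothetical placeholders. The helper simply wraps `kubectl exec` against the
# clickhouse-backup sidecar container.
exec_on_backup_container(
    "chi-simple-01-cluster-0-0-0",
    'curl -X POST -sL "http://127.0.0.1:7171/backup/create?name=example-backup"',
    timeout=120,
)
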
Example #21
def query(
    chi_name,
    sql,
    with_error=False,
    host="127.0.0.1",
    port="9000",
    user="",
    pwd="",
    ns=settings.test_namespace,
    timeout=60,
    advanced_params="",
    pod="",
):
    pod_names = kubectl.get_pod_names(chi_name, ns)
    pod_name = pod_names[0]
    for p in pod_names:
        if host in p or p == pod:
            pod_name = p
            break

    pwd_str = "" if pwd == "" else f"--password={pwd}"
    user_str = "" if user == "" else f"--user={user}"

    if with_error:
        return kubectl.launch(
            f"exec {pod_name}"
            f" --"
            f" clickhouse-client -mn -h {host} --port={port} {user_str} {pwd_str} {advanced_params}"
            f" --query=\"{sql}\""
            f" 2>&1",
            timeout=timeout,
            ns=ns,
            ok_to_fail=True,
        )
    else:
        return kubectl.launch(
            f"exec {pod_name} -n {ns}"
            f" -- "
            f"clickhouse-client -mn -h {host} --port={port} {user_str} {pwd_str} {advanced_params}"
            f" --query=\"{sql}\"",
            timeout=timeout,
            ns=ns,
        )
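
# A minimal usage sketch for query(); the CHI name and host below are hypothetical
# placeholders. Passing with_error=True runs the client with ok_to_fail and captures
# stderr instead of failing the step.
out = query(
    "simple-01",
    "SELECT count() FROM system.tables",
    host="chi-simple-01-cluster-0-0",
    with_error=True,
)
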
def check_monitoring_chi(operator_namespace, operator_pod, expect_result, max_retries=10):
    with Then(f"metrics-exporter /chi endpoint result should return {expect_result}"):
        for i in range(1, max_retries):
            # call /metrics to trigger a refresh of the monitored instances
            kubectl.launch(
                f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                ns=operator_namespace
            )
            # check /chi after the monitored instances are refreshed
            out = kubectl.launch(
                f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/chi",
                ns=operator_namespace
            )
            out = json.loads(out)
            if out == expect_result:
                break
            with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                time.sleep(i * 5)
        assert out == expect_result, error()
Example #23
def set_operator_version(version, ns=settings.operator_namespace, timeout=60):
    operator_image = f"{settings.operator_docker_repo}:{version}"
    metrics_exporter_image = f"{settings.metrics_exporter_docker_repo}:{version}"
    kubectl.launch(f"set image deployment.v1.apps/clickhouse-operator clickhouse-operator={operator_image}", ns=ns)
    kubectl.launch(f"set image deployment.v1.apps/clickhouse-operator metrics-exporter={metrics_exporter_image}", ns=ns)
    kubectl.launch("rollout status deployment.v1.apps/clickhouse-operator", ns=ns, timeout=timeout)
    assert kubectl.get_count("pod", ns=ns, label="-l app=clickhouse-operator") > 0, error()
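
# A minimal usage sketch for set_operator_version; the version tag below is an
# illustrative placeholder. The helper rolls both containers of the clickhouse-operator
# deployment to the same image tag and waits for the rollout to finish.
set_operator_version("0.18.0", ns=settings.operator_namespace, timeout=120)
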
    def make_too_many_queries():
        long_cmd = ""
        for _ in range(90):
            port = random.choice(["8123", "3306", "9000"])
            if port == "9000":
                long_cmd += 'clickhouse-client -q "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "3306":
                long_cmd += 'mysql -h 127.0.0.1 -P 3306 -u default -e "SELECT sleepEachRow(1),now() FROM numbers(60)";'
            if port == "8123":
                long_cmd += 'wget -qO- "http://127.0.0.1:8123?query=SELECT sleepEachRow(1),now() FROM numbers(60)";'

        long_cmd = f"echo '{long_cmd}' | xargs --verbose -i'{{}}' --no-run-if-empty -d ';' -P 100 bash -c '{{}}' 1>/dev/null"
        with open("/tmp/long_cmd.sh", "w") as f:
            f.write(long_cmd)

        kubectl.launch(
            f"cp /tmp/long_cmd.sh {too_many_queries_pod}:/tmp/long_cmd.sh -c clickhouse"
        )
        kubectl.launch(
            f"exec -n {kubectl.namespace} {too_many_queries_pod} -c clickhouse -- bash /tmp/long_cmd.sh",
            timeout=70,
        )
Example #25
def set_operator_version(version, ns=settings.operator_namespace, timeout=60):
    operator_image = f"{settings.operator_docker_repo}:{version}"
    metrics_exporter_image = f"{settings.metrics_exporter_docker_repo}:{version}"
    kubectl.launch(f"set image deployment.v1.apps/clickhouse-operator clickhouse-operator={operator_image}", ns=ns)
    kubectl.launch(f"set image deployment.v1.apps/clickhouse-operator metrics-exporter={metrics_exporter_image}", ns=ns)
    kubectl.launch("rollout status deployment.v1.apps/clickhouse-operator", ns=ns, timeout=timeout)
    if kubectl.get_count("pod", ns=ns, label=operator_label) == 0:
        fail("invalid clickhouse-operator pod count")
    def check_monitoring_metrics(operator_namespace, operator_pod, expect_result, max_retries=10):
        with Then(f"metrics-exporter /metrics endpoint result should match {expect_result}"):
            for i in range(1, max_retries):
                out = kubectl.launch(
                    f"exec {operator_pod} -c metrics-exporter -- wget -O- -q http://127.0.0.1:8888/metrics",
                    ns=operator_namespace
                )
                all_strings_expected_done = True
                for string, exists in expect_result.items():
                    all_strings_expected_done = (exists == (string in out))
                    if not all_strings_expected_done:
                        break

                if all_strings_expected_done:
                    break
                with Then("Not ready. Wait for " + str(i * 5) + " seconds"):
                    time.sleep(i * 5)
            assert all_strings_expected_done, error()
Example #27
def test_ch_001(self):
    util.require_zookeeper()
    chit_data = manifest.get_chit_data(
        util.get_full_path("templates/tpl-clickhouse-19.11.yaml"))
    kubectl.launch(f"delete chit {chit_data['metadata']['name']}",
                   ns=settings.test_namespace)
    kubectl.create_and_check(
        "configs/test-ch-001-insert-quorum.yaml", {
            "apply_templates": {"templates/tpl-clickhouse-20.8.yaml"},
            "pod_count": 2,
            "do_not_delete": 1,
        })
    chi = manifest.get_chi_name(
        util.get_full_path("configs/test-ch-001-insert-quorum.yaml"))
    chi_data = kubectl.get("chi", ns=settings.test_namespace, name=chi)
    util.wait_clickhouse_cluster_ready(chi_data)

    host0 = "chi-test-ch-001-insert-quorum-default-0-0"
    host1 = "chi-test-ch-001-insert-quorum-default-0-1"

    create_table = """
    create table t1 on cluster default (a Int8, d Date default today())
    Engine = ReplicatedMergeTree('/clickhouse/tables/{table}', '{replica}')
    partition by d order by a 
    TTL d + interval 5 second
    SETTINGS merge_with_ttl_timeout=5""".replace('\r', '').replace('\n', '')

    create_mv_table2 = """
    create table t2 on cluster default (a Int8)
    Engine = ReplicatedMergeTree('/clickhouse/tables/{table}', '{replica}')
    partition by tuple() order by a""".replace('\r', '').replace('\n', '')

    create_mv_table3 = """
    create table t3 on cluster default (a Int8)
    Engine = ReplicatedMergeTree('/clickhouse/tables/{table}', '{replica}')
    partition by tuple() order by a""".replace('\r', '').replace('\n', '')

    create_mv2 = "create materialized view t_mv2 on cluster default to t2 as select a from t1"
    create_mv3 = "create materialized view t_mv3 on cluster default to t3 as select a from t1"

    with Given("Tables t1, t2, t3 and MVs t1->t2, t1->t3 are created"):
        clickhouse.query(chi, create_table)
        clickhouse.query(chi, create_mv_table2)
        clickhouse.query(chi, create_mv_table3)

        clickhouse.query(chi, create_mv2)
        clickhouse.query(chi, create_mv3)

        with When("Add a row to an old partition"):
            clickhouse.query(chi,
                             "insert into t1(a,d) values(6, today()-1)",
                             host=host0)

        with When("Stop fetches for t1 at replica1"):
            clickhouse.query(chi, "system stop fetches default.t1", host=host1)

            with Then("Wait 10 seconds and the data should be dropped by TTL"):
                time.sleep(10)
                out = clickhouse.query(chi,
                                       "select count() from t1 where a=6",
                                       host=host0)
                assert out == "0"

        with When("Resume fetches for t1 at replica1"):
            clickhouse.query(chi,
                             "system start fetches default.t1",
                             host=host1)
            time.sleep(5)

            with Then("Inserts should resume"):
                clickhouse.query(chi,
                                 "insert into t1(a) values(7)",
                                 host=host0)

        clickhouse.query(chi, "insert into t1(a) values(1)")

        with When("Stop fetches for t2 at replica1"):
            clickhouse.query(chi, "system stop fetches default.t2", host=host1)

            with Then("Insert should fail since it can not reach the quorum"):
                out = clickhouse.query_with_error(
                    chi, "insert into t1(a) values(2)", host=host0)
                assert "Timeout while waiting for quorum" in out

        # kubectl(f"exec {host0}-0 -n test -- cp /var/lib//clickhouse/data/default/t2/all_1_1_0/a.mrk2 /var/lib//clickhouse/data/default/t2/all_1_1_0/a.bin")
        # with Then("Corrupt data part in t2"):
        #    kubectl(f"exec {host0}-0 -n test -- sed -i \"s/b/c/\" /var/lib/clickhouse/data/default/t2/all_1_1_0/a.bin")

        with When("Resume fetches for t2 at replica1"):
            clickhouse.query(chi,
                             "system start fetches default.t2",
                             host=host1)
            i = 0
            while "2" != clickhouse.query(
                    chi,
                    "select active_replicas from system.replicas where database='default' and table='t1'",
                    pod=host0) and i < 10:
                with Then("Not ready, wait 5 seconds"):
                    time.sleep(5)
                    i += 1

            with Then(
                    "Inserts should fail with an error regarding not satisfied quorum"
            ):
                out = clickhouse.query_with_error(
                    chi, "insert into t1(a) values(3)", host=host0)
                assert "Quorum for previous write has not been satisfied yet" in out

            with And("Second insert of the same block should pass"):
                clickhouse.query(chi,
                                 "insert into t1(a) values(3)",
                                 host=host0)

            with And("Insert of the new block should fail"):
                out = clickhouse.query_with_error(
                    chi, "insert into t1(a) values(4)", host=host0)
                assert "Quorum for previous write has not been satisfied yet" in out

            with And(
                    "Second insert of the same block with 'deduplicate_blocks_in_dependent_materialized_views' setting should fail"
            ):
                out = clickhouse.query_with_error(
                    chi,
                    "set deduplicate_blocks_in_dependent_materialized_views=1; insert into t1(a) values(5)",
                    host=host0)
                assert "Quorum for previous write has not been satisfied yet" in out

        out = clickhouse.query_with_error(
            chi,
            "select t1.a t1_a, t2.a t2_a from t1 left outer join t2 using (a) order by t1_a settings join_use_nulls=1"
        )
        print(out)
def restart_zookeeper():
    kubectl.launch(
        f"exec -n {kubectl.namespace} {zookeeper_pod} -- sh -c \"kill 1\"",
        ok_to_fail=True,
    )
def set_metrics_exporter_version(version, ns=settings.operator_namespace):
    kubectl.launch(f"set image deployment.v1.apps/clickhouse-operator metrics-exporter=altinity/metrics-exporter:{version}", ns=ns)
    kubectl.launch("rollout status deployment.v1.apps/clickhouse-operator", ns=ns)
def reboot_clickhouse_server():
    kubectl.launch(
        f"exec -n {kubectl.namespace} {clickhouse_pod} -c clickhouse -- kill 1",
        ok_to_fail=True,
    )