def test_seekable_formats(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] # type: ClickHouseInstance table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')" instance.query( f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) settings s3_truncate_on_insert=1" ) result = instance.query(f"SELECT count() FROM {table_function}") assert (int(result) == 5000000) table_function = f"s3(s3_orc, structure='a Int32, b String', format='ORC')" exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) settings s3_truncate_on_insert=1" ) result = instance.query(f"SELECT count() FROM {table_function}") assert (int(result) == 5000000) instance.query("SYSTEM FLUSH LOGS") result = instance.query( f"SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM s3') AND memory_usage > 0 ORDER BY event_time desc" ) print(result[:3]) assert (int(result[:3]) < 200)
def test_in_memory_alters(start_cluster): def check_parts_type(parts_num): assert node9.query( "SELECT part_type, count() FROM system.parts WHERE table = 'alters_table' \ AND active GROUP BY part_type") == "InMemory\t{}\n".format( parts_num) node9.query( "INSERT INTO alters_table (date, id, s) VALUES (toDate('2020-10-10'), 1, 'ab'), (toDate('2020-10-10'), 2, 'cd')" ) node9.query("ALTER TABLE alters_table ADD COLUMN col1 UInt32") node9.restart_clickhouse(kill=True) expected = "1\tab\t0\n2\tcd\t0\n" assert node9.query( "SELECT id, s, col1 FROM alters_table ORDER BY id") == expected check_parts_type(1) # After hard restart table can be in readonly mode exec_query_with_retry( node9, "INSERT INTO alters_table (date, id, col1) VALUES (toDate('2020-10-10'), 3, 100)" ) node9.query("ALTER TABLE alters_table MODIFY COLUMN col1 String") node9.query("ALTER TABLE alters_table DROP COLUMN s") node9.restart_clickhouse(kill=True) check_parts_type(2) with pytest.raises(Exception): node9.query("SELECT id, s, col1 FROM alters_table") # Values of col1 was not materialized as integers, so they have # default string values after alter expected = "1\t_foo\n2\t_foo\n3\t100_foo\n" assert node9.query( "SELECT id, col1 || '_foo' FROM alters_table ORDER BY id") == expected
def test_seekable_formats_url(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] table_function = f"s3(s3_parquet, structure='a Int32, b String', format='Parquet')" instance.query( f"insert into table function {table_function} select number, randomString(100) from numbers(5000000) settings s3_truncate_on_insert=1" ) table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_parquet', 'Parquet', 'a Int32, b String')" result = instance.query(f"SELECT count() FROM {table_function}") assert (int(result) == 5000000) table_function = f"s3(s3_orc, structure='a Int32, b String', format='ORC')" exec_query_with_retry( instance, f"insert into table function {table_function} select number, randomString(100) from numbers(5000000) settings s3_truncate_on_insert=1" ) table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_orc', 'ORC', 'a Int32, b String')" result = instance.query(f"SELECT count() FROM {table_function}") assert (int(result) == 5000000) instance.query("SYSTEM FLUSH LOGS") result = instance.query( f"SELECT formatReadableSize(memory_usage) FROM system.query_log WHERE startsWith(query, 'SELECT count() FROM url') AND memory_usage > 0 ORDER BY event_time desc" ) print(result[:3]) assert (int(result[:3]) < 200)
def test_upgrade_while_mutation(start_cluster): node3.query("DROP TABLE IF EXISTS mt1") node3.query( "CREATE TABLE mt1 (EventDate Date, id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t1', 'node3') ORDER BY tuple()" ) node3.query( "INSERT INTO mt1 select '2020-02-13', number from numbers(100000)") node3.query("SYSTEM STOP MERGES mt1") node3.query("ALTER TABLE mt1 DELETE WHERE id % 2 == 0") node3.query("DETACH TABLE mt1") # stop being leader node3.restart_with_latest_version(signal=9, fix_metadata=True) # checks for readonly exec_query_with_retry(node3, "OPTIMIZE TABLE mt1", sleep_time=5, retry_count=60) node3.query("ALTER TABLE mt1 DELETE WHERE id > 100000", settings={"mutations_sync": "2"}) # will delete nothing, but previous async mutation will finish with this query assert_eq_with_retry(node3, "SELECT COUNT() from mt1", "50000\n") node3.query("DROP TABLE mt1")
def test_mutate_and_upgrade(start_cluster): for node in [node1, node2]: node.query("DROP TABLE IF EXISTS mt") node.query( "CREATE TABLE mt (EventDate Date, id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t', '{}') ORDER BY tuple()" .format(node.name)) node1.query("INSERT INTO mt VALUES ('2020-02-13', 1), ('2020-02-13', 2);") node1.query("ALTER TABLE mt DELETE WHERE id = 2", settings={"mutations_sync": "2"}) node2.query("SYSTEM SYNC REPLICA mt", timeout=15) node2.query("DETACH TABLE mt") # stop being leader node1.query("DETACH TABLE mt") # stop being leader node1.restart_with_latest_version(signal=9, fix_metadata=True) node2.restart_with_latest_version(signal=9, fix_metadata=True) # After hard restart table can be in readonly mode exec_query_with_retry(node2, "INSERT INTO mt VALUES ('2020-02-13', 3)", retry_count=60) exec_query_with_retry(node1, "SYSTEM SYNC REPLICA mt", retry_count=60) assert node1.query("SELECT COUNT() FROM mt") == "2\n" assert node2.query("SELECT COUNT() FROM mt") == "2\n" node1.query("INSERT INTO mt VALUES ('2020-02-13', 4);") node2.query("SYSTEM SYNC REPLICA mt", timeout=15) assert node1.query("SELECT COUNT() FROM mt") == "3\n" assert node2.query("SELECT COUNT() FROM mt") == "3\n" node2.query("ALTER TABLE mt DELETE WHERE id = 3", settings={"mutations_sync": "2"}) node1.query("SYSTEM SYNC REPLICA mt", timeout=15) assert node1.query("SELECT COUNT() FROM mt") == "2\n" assert node2.query("SELECT COUNT() FROM mt") == "2\n" node1.query( "ALTER TABLE mt MODIFY COLUMN id Int32 DEFAULT 0", settings={"replication_alter_partitions_sync": "2"}, ) node2.query("OPTIMIZE TABLE mt FINAL") assert node1.query("SELECT id FROM mt") == "1\n4\n" assert node2.query("SELECT id FROM mt") == "1\n4\n" for node in [node1, node2]: node.query("DROP TABLE mt")
def test_in_memory_deduplication(start_cluster): for i in range(3): # table can be in readonly node exec_query_with_retry( node9, "INSERT INTO deduplication_table (date, id, s) VALUES (toDate('2020-03-03'), 1, 'foo')" ) exec_query_with_retry( node10, "INSERT INTO deduplication_table (date, id, s) VALUES (toDate('2020-03-03'), 1, 'foo')" ) node9.query("SYSTEM SYNC REPLICA deduplication_table", timeout=20) node10.query("SYSTEM SYNC REPLICA deduplication_table", timeout=20) assert node9.query("SELECT date, id, s FROM deduplication_table" ) == "2020-03-03\t1\tfoo\n" assert node10.query("SELECT date, id, s FROM deduplication_table" ) == "2020-03-03\t1\tfoo\n"
def test_upgrade_while_mutation(start_cluster): node3.query( "CREATE TABLE mt1 (EventDate Date, id UInt64) ENGINE ReplicatedMergeTree('/clickhouse/tables/t1', 'node3') ORDER BY tuple()" ) node3.query( "INSERT INTO mt1 select '2020-02-13', number from numbers(100000)") node3.query("SYSTEM STOP MERGES mt1") node3.query("ALTER TABLE mt1 DELETE WHERE id % 2 == 0") node3.restart_with_latest_version(signal=9) exec_query_with_retry(node3, "ALTER TABLE mt1 DELETE WHERE id > 100000", settings={"mutations_sync": "2"}) # will delete nothing, but previous async mutation will finish with this query assert_eq_with_retry(node3, "SELECT COUNT() from mt1", "50000\n")
def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): drop_table([node_left, node_right], "test_ttl_delete") drop_table([node_left, node_right], "test_ttl_group_by") drop_table([node_left, node_right], "test_ttl_where") for node in [node_left, node_right]: node.query(''' CREATE TABLE test_ttl_delete(date DateTime, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_delete_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100 '''.format(suff=num_run, replica=node.name)) node.query(''' CREATE TABLE test_ttl_group_by(date DateTime, id UInt32, val UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_group_by_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND GROUP BY id SET val = sum(val) SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100 '''.format(suff=num_run, replica=node.name)) node.query(''' CREATE TABLE test_ttl_where(date DateTime, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_where_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND DELETE WHERE id % 2 = 1 SETTINGS max_number_of_merges_with_ttl_in_pool=100, max_replicated_merges_with_ttl_in_queue=100 '''.format(suff=num_run, replica=node.name)) node_left.query("INSERT INTO test_ttl_delete VALUES (now(), 1)") node_left.query( "INSERT INTO test_ttl_delete VALUES (toDateTime('2100-10-11 10:00:00'), 2)" ) node_right.query("INSERT INTO test_ttl_delete VALUES (now(), 3)") node_right.query( "INSERT INTO test_ttl_delete VALUES (toDateTime('2100-10-11 10:00:00'), 4)" ) node_left.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 1)") node_left.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 2)") node_right.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 3)") node_right.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 4)") node_left.query("INSERT INTO test_ttl_where VALUES (now(), 1)") node_left.query("INSERT INTO test_ttl_where VALUES (now(), 2)") node_right.query("INSERT INTO test_ttl_where VALUES (now(), 3)") node_right.query("INSERT INTO test_ttl_where VALUES (now(), 4)") if node_left.with_installed_binary: node_left.restart_with_latest_version() if node_right.with_installed_binary: node_right.restart_with_latest_version() time.sleep(5) # Wait for TTL # after restart table can be in readonly mode exec_query_with_retry(node_right, "OPTIMIZE TABLE test_ttl_delete FINAL") node_right.query("OPTIMIZE TABLE test_ttl_group_by FINAL") node_right.query("OPTIMIZE TABLE test_ttl_where FINAL") exec_query_with_retry(node_left, "OPTIMIZE TABLE test_ttl_delete FINAL") node_left.query("OPTIMIZE TABLE test_ttl_group_by FINAL", timeout=20) node_left.query("OPTIMIZE TABLE test_ttl_where FINAL", timeout=20) # After OPTIMIZE TABLE, it is not guaranteed that everything is merged. # Possible scenario (for test_ttl_group_by): # 1. Two independent merges assigned: [0_0, 1_1] -> 0_1 and [2_2, 3_3] -> 2_3 # 2. Another one merge assigned: [0_1, 2_3] -> 0_3 # 3. Merge to 0_3 is delayed: # `Not executing log entry for part 0_3 because 2 merges with TTL already executing, maximum 2 # 4. OPTIMIZE FINAL does nothing, cause there is an entry for 0_3 # # So, let's also sync replicas for node_right (for now). exec_query_with_retry(node_right, "SYSTEM SYNC REPLICA test_ttl_delete") node_right.query("SYSTEM SYNC REPLICA test_ttl_group_by", timeout=20) node_right.query("SYSTEM SYNC REPLICA test_ttl_where", timeout=20) exec_query_with_retry(node_left, "SYSTEM SYNC REPLICA test_ttl_delete") node_left.query("SYSTEM SYNC REPLICA test_ttl_group_by", timeout=20) node_left.query("SYSTEM SYNC REPLICA test_ttl_where", timeout=20) assert node_left.query( "SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_right.query( "SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_left.query( "SELECT val FROM test_ttl_group_by ORDER BY id") == "10\n" assert node_right.query( "SELECT val FROM test_ttl_group_by ORDER BY id") == "10\n" assert node_left.query( "SELECT id FROM test_ttl_where ORDER BY id") == "2\n4\n" assert node_right.query( "SELECT id FROM test_ttl_where ORDER BY id") == "2\n4\n"
def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): drop_table([node_left, node_right], "test_ttl_delete") drop_table([node_left, node_right], "test_ttl_group_by") drop_table([node_left, node_right], "test_ttl_where") for node in [node_left, node_right]: node.query( ''' CREATE TABLE test_ttl_delete(date DateTime, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_delete_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND '''.format(suff=num_run, replica=node.name)) node.query( ''' CREATE TABLE test_ttl_group_by(date DateTime, id UInt32, val UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_group_by_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND GROUP BY id SET val = sum(val) '''.format(suff=num_run, replica=node.name)) node.query( ''' CREATE TABLE test_ttl_where(date DateTime, id UInt32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/test_ttl_where_{suff}', '{replica}') ORDER BY id PARTITION BY toDayOfMonth(date) TTL date + INTERVAL 3 SECOND DELETE WHERE id % 2 = 1 '''.format(suff=num_run, replica=node.name)) node_left.query("INSERT INTO test_ttl_delete VALUES (now(), 1)") node_left.query("INSERT INTO test_ttl_delete VALUES (toDateTime('2100-10-11 10:00:00'), 2)") node_right.query("INSERT INTO test_ttl_delete VALUES (now(), 3)") node_right.query("INSERT INTO test_ttl_delete VALUES (toDateTime('2100-10-11 10:00:00'), 4)") node_left.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 1)") node_left.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 2)") node_right.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 3)") node_right.query("INSERT INTO test_ttl_group_by VALUES (now(), 0, 4)") node_left.query("INSERT INTO test_ttl_where VALUES (now(), 1)") node_left.query("INSERT INTO test_ttl_where VALUES (now(), 2)") node_right.query("INSERT INTO test_ttl_where VALUES (now(), 3)") node_right.query("INSERT INTO test_ttl_where VALUES (now(), 4)") if node_left.with_installed_binary: node_left.restart_with_latest_version() if node_right.with_installed_binary: node_right.restart_with_latest_version() time.sleep(5) # Wait for TTL # after restart table can be in readonly mode exec_query_with_retry(node_right, "OPTIMIZE TABLE test_ttl_delete FINAL") node_right.query("OPTIMIZE TABLE test_ttl_group_by FINAL") node_right.query("OPTIMIZE TABLE test_ttl_where FINAL") exec_query_with_retry(node_left, "SYSTEM SYNC REPLICA test_ttl_delete") node_left.query("SYSTEM SYNC REPLICA test_ttl_group_by", timeout=20) node_left.query("SYSTEM SYNC REPLICA test_ttl_where", timeout=20) assert node_left.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_right.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_left.query("SELECT val FROM test_ttl_group_by ORDER BY id") == "10\n" assert node_right.query("SELECT val FROM test_ttl_group_by ORDER BY id") == "10\n" assert node_left.query("SELECT id FROM test_ttl_where ORDER BY id") == "2\n4\n" assert node_right.query("SELECT id FROM test_ttl_where ORDER BY id") == "2\n4\n"