def populate(c: Composition) -> None:
    # Create some database objects
    c.testdrive(
        dedent(
            """
            > CREATE TABLE t1 (f1 INTEGER);

            > INSERT INTO t1 SELECT * FROM generate_series(1, 10);

            > CREATE MATERIALIZED VIEW v1 AS SELECT COUNT(*) AS c1 FROM t1;

            > CREATE TABLE ten (f1 INTEGER);

            > INSERT INTO ten SELECT * FROM generate_series(1, 10);

            > CREATE MATERIALIZED VIEW expensive AS
              SELECT (a1.f1 * 1) + (a2.f1 * 10) + (a3.f1 * 100) + (a4.f1 * 1000) +
                (a5.f1 * 10000) + (a6.f1 * 100000) + (a7.f1 * 1000000)
              FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4, ten AS a5, ten AS a6, ten AS a7;

            $ kafka-create-topic topic=source1

            $ kafka-ingest format=bytes topic=source1 repeat=1000000
            A${kafka-ingest.iteration}

            > CREATE SOURCE source1
              FROM KAFKA BROKER '${testdrive.kafka-addr}'
              TOPIC 'testdrive-source1-${testdrive.seed}'
              FORMAT BYTES

            > CREATE MATERIALIZED VIEW v2 AS SELECT COUNT(*) FROM source1
            """
        ),
    )
def run(self, c: Composition) -> None:
    if not self.new_sink:
        return

    c.testdrive(
        dedent(
            f"""
            > CREATE CONNECTION IF NOT EXISTS {self.sink.name}_kafka_conn
              FOR KAFKA BROKER '${{testdrive.kafka-addr}}';

            > CREATE CONNECTION IF NOT EXISTS {self.sink.name}_csr_conn
              FOR CONFLUENT SCHEMA REGISTRY URL '${{testdrive.schema-registry-url}}';

            > CREATE SINK {self.sink.name} FROM {self.source_view.name}
              INTO KAFKA CONNECTION {self.sink.name}_kafka_conn
              TOPIC 'sink-{self.sink.name}'
              FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION {self.sink.name}_csr_conn;

            # Ingest the sink again in order to be able to validate its contents

            > CREATE SOURCE {self.sink.name}_source
              FROM KAFKA CONNECTION {self.sink.name}_kafka_conn
              TOPIC 'sink-{self.sink.name}'
              FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION {self.sink.name}_csr_conn
              ENVELOPE NONE

            # The sink-derived source exposes the 'before' and 'after' records, so
            # produce a "normal" ViewExists output by summing the 'after' fields and
            # subtracting the 'before' fields

            > CREATE MATERIALIZED VIEW {self.dest_view.name} AS
              SELECT SUM(min)::int AS min, SUM(max)::int AS max, SUM(c1)::int AS c1, SUM(c2)::int AS c2
              FROM (
                SELECT (after).min, (after).max, (after).c1, (after).c2 FROM {self.sink.name}_source
                UNION ALL
                SELECT -(before).min, -(before).max, -(before).c1, -(before).c2 FROM {self.sink.name}_source
              );
            """
        )
    )
def create_invalid_replica(c: Composition) -> None:
    c.testdrive(
        dedent(
            """
            > CREATE CLUSTER REPLICA cluster1.replica3 REMOTE ['no_such_host:2100']
            """
        )
    )
def drop_create_replica(c: Composition) -> None:
    c.testdrive(
        dedent(
            """
            > DROP CLUSTER REPLICA cluster1.replica1

            > CREATE CLUSTER REPLICA cluster1.replica3 REMOTE ['computed_1_1:2100', 'computed_1_2:2100']
            """
        )
    )
def run(self, c: Composition) -> None:
    if self.new_source:
        c.testdrive(
            f"""
> CREATE MATERIALIZED SOURCE {self.source.name}
  FROM KAFKA BROKER '${{testdrive.kafka-addr}}'
  TOPIC 'testdrive-{self.topic.name}-${{testdrive.seed}}'
  FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${{testdrive.schema-registry-url}}'
  ENVELOPE UPSERT
"""
        )
def run(self, c: Composition) -> None:
    if self.new_topic:
        c.testdrive(
            f"""
$ kafka-create-topic topic={self.topic.name} partitions={self.topic.partitions}

{SCHEMA}

$ kafka-ingest format=avro key-format=avro topic={self.topic.name} schema=${{schema}} key-schema=${{keyschema}} repeat=1
{{"key": 0}} {{"f1": 0}}
"""
        )
def run(self, c: Composition) -> None:
    watermarks = self.view.get_watermarks()
    view_min = watermarks.min
    view_max = watermarks.max

    if view_min <= view_max:
        c.testdrive(
            f"""
> SELECT * FROM {self.view.name} /* {view_min} {view_max} {(view_max-view_min)+1} {(view_max-view_min)+1} */ ;
{view_min} {view_max} {(view_max-view_min)+1} {(view_max-view_min)+1}
"""
        )
def run(self, c: Composition) -> None:
    prev_high = self.topic.watermarks.high

    self.topic.watermarks.high = prev_high + self.delta
    assert self.topic.watermarks.high >= 0
    assert self.topic.watermarks.low >= 0

    c.testdrive(
        f"""
{SCHEMA}
$ kafka-ingest format=avro key-format=avro topic={self.topic.name} schema=${{schema}} key-schema=${{keyschema}} start-iteration={prev_high + 1} publish=true repeat={self.delta}
{{"f1": ${{kafka-ingest.iteration}}}} {{"f2": ${{kafka-ingest.iteration}}}}
"""
    )
def run(self, c: Composition) -> None:
    prev_max = self.topic.watermarks.max

    self.topic.watermarks.max = prev_max + self.delta
    assert self.topic.watermarks.max >= 0
    assert self.topic.watermarks.min >= 0

    c.testdrive(
        f"""
{SCHEMA}
$ kafka-ingest format=avro key-format=avro topic={self.topic.name} schema=${{schema}} key-schema=${{keyschema}} start-iteration={prev_max + 1} repeat={self.delta}
{{"key": ${{kafka-ingest.iteration}}}} {{"f1": ${{kafka-ingest.iteration}}}}
"""
    )
def run(self, c: Composition) -> None:
    froms: WatermarkedObjects = self.view.froms

    # Each input is expected to contain the consecutive values between its own
    # watermarks, so the join should return exactly the overlap [low, high],
    # which contains (high-low)+1 distinct values.
    low = max([f.get_watermarks().low for f in froms])
    high = min([f.get_watermarks().high for f in froms])

    if low <= high:
        c.testdrive(
            f"""
> SELECT * FROM {self.view.name};
{low} {high} {(high-low)+1} {(high-low)+1}
"""
        )
def run(self, c: Composition) -> None:
    if self.new_source:
        envelope = str(self.topic.envelope).split(".")[1]
        c.testdrive(
            f"""
> CREATE CONNECTION IF NOT EXISTS {self.source.name}_csr_conn
  FOR CONFLUENT SCHEMA REGISTRY URL '${{testdrive.schema-registry-url}}';

> CREATE SOURCE {self.source.name}
  FROM KAFKA BROKER '${{testdrive.kafka-addr}}'
  TOPIC 'testdrive-{self.topic.name}-${{testdrive.seed}}'
  FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION {self.source.name}_csr_conn
  ENVELOPE {envelope}
"""
        )
def workflow_test_builtin_migration(c: Composition) -> None:
    """Exercise the builtin object migration code by upgrading between two versions
    that will have a migration triggered between them. Create a materialized view
    over the affected builtin object to confirm that the migration was successful.
    """

    c.down(destroy_volumes=True)
    with c.override(
        # Random commit before pg_roles was updated.
        Materialized(
            image="materialize/materialized:devel-9efd269199b1510b3e8f90196cb4fa3072a548a1",
        ),
        Testdrive(default_timeout="15s", no_reset=True, consistent_seed=True),
    ):
        c.up("testdrive", persistent=True)
        c.up("materialized")
        c.wait_for_materialized()

        c.testdrive(
            input=dedent(
                """
                > CREATE VIEW v1 AS SELECT COUNT(*) FROM pg_roles;

                > SELECT * FROM v1;
                2

                ! SELECT DISTINCT rolconnlimit FROM pg_roles;
                contains:column "rolconnlimit" does not exist
                """
            )
        )

        c.kill("materialized")

    with c.override(
        # This will stop working if we introduce a breaking change.
        Materialized(),
        Testdrive(default_timeout="15s", no_reset=True, consistent_seed=True),
    ):
        c.up("testdrive", persistent=True)
        c.up("materialized")
        c.wait_for_materialized()

        c.testdrive(
            input=dedent(
                """
                > SELECT * FROM v1;
                2

                # This column is new after the migration
                > SELECT DISTINCT rolconnlimit FROM pg_roles;
                -1
                """
            )
        )
def run(self, c: Composition) -> None:
    prev_low = self.topic.watermarks.low

    # The low watermark advances by at most delta and never past the high watermark.
    self.topic.watermarks.low = min(prev_low + self.delta, self.topic.watermarks.high)
    assert self.topic.watermarks.high >= 0
    assert self.topic.watermarks.low >= 0

    actual_delta = self.topic.watermarks.low - prev_low

    if actual_delta > 0:
        c.testdrive(
            f"""
{SCHEMA}
$ kafka-ingest format=avro topic={self.topic.name} key-format=avro key-schema=${{keyschema}} schema=${{schema}} start-iteration={prev_low} publish=true repeat={actual_delta}
{{"f1": ${{kafka-ingest.iteration}}}}
"""
        )
def populate(c: Composition) -> None:
    # Create some database objects
    c.testdrive(
        """
> SET cluster=cluster1

> DROP TABLE IF EXISTS t1 CASCADE;

> CREATE TABLE t1 (f1 TEXT);

> INSERT INTO t1 VALUES (1), (2);

> CREATE VIEW v1 AS SELECT COUNT(*) AS c1 FROM t1;

> CREATE DEFAULT INDEX i1 IN CLUSTER cluster2 ON v1;
""",
    )
def run(self, c: Composition) -> None:
    if self.topic.envelope is Envelope.NONE:
        return

    prev_min = self.topic.watermarks.min

    # The min watermark advances by at most delta and never past the max watermark.
    self.topic.watermarks.min = min(prev_min + self.delta, self.topic.watermarks.max)
    assert self.topic.watermarks.max >= 0
    assert self.topic.watermarks.min >= 0

    actual_delta = self.topic.watermarks.min - prev_min

    if actual_delta > 0:
        c.testdrive(
            f"""
{SCHEMA}
$ kafka-ingest format=avro topic={self.topic.name} key-format=avro key-schema=${{keyschema}} schema=${{schema}} start-iteration={prev_min} repeat={actual_delta}
{{"key": ${{kafka-ingest.iteration}}}}
"""
        )
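# The Kafka ingestion actions above interpolate a module-level SCHEMA constant
# that defines the testdrive `keyschema` and `schema` variables. A minimal
# sketch of such a constant, assuming long-typed fields; the exact field names
# are an assumption: the actions that ingest {"key": ...} {"f1": ...} imply a
# key field named "key" and a value field named "f1", while the publish=true
# variants imply "f1"/"f2" instead.
SCHEMA = dedent(
    """
    $ set keyschema={
        "type": "record",
        "name": "Key",
        "fields": [ {"name": "key", "type": "long"} ]
      }

    $ set schema={
        "type": "record",
        "name": "test",
        "fields": [ {"name": "f1", "type": "long"} ]
      }
    """
)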
def validate(c: Composition) -> None:
    # Validate that the cluster continues to operate
    c.testdrive(
        dedent(
            """
            # Dataflows
            > SELECT * FROM v1;
            10

            # Existing sources
            $ kafka-ingest format=bytes topic=source1 repeat=1000000
            B${kafka-ingest.iteration}

            > SELECT * FROM v2;
            2000000

            # Existing tables
            > INSERT INTO t1 VALUES (20);

            > SELECT * FROM v1;
            11

            # New materialized views
            > CREATE MATERIALIZED VIEW v3 AS SELECT COUNT(*) AS c1 FROM t1;

            > SELECT * FROM v3;
            11

            # New tables
            > CREATE TABLE t2 (f1 INTEGER);

            > INSERT INTO t2 SELECT * FROM t1;

            > SELECT COUNT(*) FROM t2;
            11

            # New sources
            > CREATE MATERIALIZED SOURCE source2
              FROM KAFKA BROKER '${testdrive.kafka-addr}'
              TOPIC 'testdrive-source1-${testdrive.seed}'
              FORMAT BYTES

            > SELECT COUNT(*) FROM source2
            2000000
            """
        ),
    )
def run(self, c: Composition) -> None:
    if not self.new_view:
        return

    some_from = random.choice(self.view.froms)
    # These are inner joins; named for the clause they build, not the join type.
    join_clauses = "\n  ".join(
        f"JOIN {f.name} USING (f1)" for f in self.view.froms[1:]
    )

    c.testdrive(
        f"""
> CREATE MATERIALIZED VIEW {self.view.name} AS
  SELECT
    MIN({some_from.name}.f1),
    MAX({some_from.name}.f1),
    COUNT({some_from.name}.f1) AS c1,
    COUNT(DISTINCT {some_from.name}.f1) AS c2
  FROM {self.view.froms[0].name}
  {join_clauses};
"""
    )
def run(self, c: Composition) -> None:
    self.table.watermarks.low = min(
        self.table.watermarks.low + self.delta, self.table.watermarks.high
    )
    c.testdrive(
        f"> DELETE FROM {self.table.name} WHERE f1 < {self.table.watermarks.low};"
    )
def validate(c: Composition) -> None:
    # Validate that cluster2 continues to operate
    c.testdrive(
        r"""
# Dataflows

$ set-regex match=\d{13} replacement=<TIMESTAMP>

> SET cluster=cluster2

> SELECT * FROM v1;
2

# Tables

> INSERT INTO t1 VALUES (3);

> SELECT * FROM t1;
1
2
3

# Introspection tables

> SHOW CLUSTERS LIKE 'cluster2'
cluster2

> SELECT name FROM mz_tables WHERE name = 't1';
t1

# DDL statements

> CREATE MATERIALIZED VIEW v2 AS SELECT COUNT(*) AS c1 FROM t1;

> SELECT * FROM v2;
3

> CREATE MATERIALIZED VIEW v1mat AS SELECT * FROM v1;

> CREATE INDEX i2 IN CLUSTER cluster2 ON t1 (f1);

> SELECT f1 FROM t1;
1
2
3

# Tables

> CREATE TABLE t2 (f1 INTEGER);

> INSERT INTO t2 VALUES (1);

> INSERT INTO t2 SELECT * FROM t2;

> SELECT * FROM t2;
1
1

# Sources

> CREATE CONNECTION IF NOT EXISTS kafka_conn FOR KAFKA BROKER '${testdrive.kafka-addr}';

> CREATE CONNECTION IF NOT EXISTS csr_conn FOR CONFLUENT SCHEMA REGISTRY URL '${testdrive.schema-registry-url}';

$ kafka-create-topic topic=source1 partitions=1

$ kafka-ingest format=bytes topic=source1
A

> CREATE SOURCE source1
  FROM KAFKA CONNECTION kafka_conn
  TOPIC 'testdrive-source1-${testdrive.seed}'
  FORMAT BYTES

> SELECT * FROM source1
A

# Sinks

> CREATE SINK sink1 FROM v1mat
  INTO KAFKA CONNECTION kafka_conn
  TOPIC 'sink1'
  FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY CONNECTION csr_conn

$ kafka-verify format=avro sink=materialize.public.sink1 sort-messages=true
{"before": null, "after": {"row":{"c1": 3}}, "transaction": {"id": "<TIMESTAMP>"}}
""",
    )
def workflow_instance_size(c: Composition, parser: WorkflowArgumentParser) -> None:
    """Create multiple clusters with multiple nodes and replicas each"""

    parser.add_argument(
        "--workers",
        type=int,
        metavar="N",
        default=2,
        help="set the default number of workers",
    )
    parser.add_argument(
        "--clusters",
        type=int,
        metavar="N",
        default=16,
        help="set the number of clusters to create",
    )
    parser.add_argument(
        "--nodes",
        type=int,
        metavar="N",
        default=4,
        help="set the number of nodes per cluster",
    )
    parser.add_argument(
        "--replicas",
        type=int,
        metavar="N",
        default=4,
        help="set the number of replicas per cluster",
    )
    args = parser.parse_args()

    c.start_and_wait_for_tcp(services=["zookeeper", "kafka", "schema-registry"])
    c.up("testdrive", persistent=True)
    c.up("materialized")
    c.wait_for_materialized()

    # Construct the required Computed instances and peer them into clusters
    computeds = []
    for cluster_id in range(0, args.clusters):
        for replica_id in range(0, args.replicas):
            nodes = []
            for node_id in range(0, args.nodes):
                node_name = f"computed_{cluster_id}_{replica_id}_{node_id}"
                nodes.append(node_name)

            for node_id in range(0, args.nodes):
                computeds.append(
                    Computed(name=nodes[node_id], peers=nodes, workers=args.workers)
                )

    with c.override(*computeds):
        with c.override(Testdrive(seed=1, no_reset=True)):
            for n in computeds:
                c.up(n.name)

            # Create some input data
            c.testdrive(
                dedent(
                    """
                    > CREATE TABLE ten (f1 INTEGER);

                    > INSERT INTO ten VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);

                    $ set schema={
                        "type" : "record",
                        "name" : "test",
                        "fields" : [
                            {"name":"f1", "type":"string"}
                        ]
                      }

                    $ kafka-create-topic topic=instance-size

                    $ kafka-ingest format=avro topic=instance-size schema=${schema} publish=true repeat=10000
                    {"f1": "fish"}
                    """
                )
            )

            # Construct the required CREATE CLUSTER statements
            for cluster_id in range(0, args.clusters):
                replica_definitions = []
                for replica_id in range(0, args.replicas):
                    nodes = []
                    for node_id in range(0, args.nodes):
                        node_name = f"computed_{cluster_id}_{replica_id}_{node_id}"
                        nodes.append(node_name)

                    replica_name = f"replica_{cluster_id}_{replica_id}"
                    replica_definitions.append(
                        f"{replica_name} (REMOTE ["
                        + ", ".join(f"'{n}:2100'" for n in nodes)
                        + "])"
                    )

                c.sql(
                    f"CREATE CLUSTER cluster_{cluster_id} REPLICAS ("
                    + ",".join(replica_definitions)
                    + ")"
                )

            # Construct some dataflows in each cluster
            for cluster_id in range(0, args.clusters):
                cluster_name = f"cluster_{cluster_id}"

                c.testdrive(
                    dedent(
                        f"""
                        > SET cluster={cluster_name}

                        > CREATE DEFAULT INDEX ON ten;

                        > CREATE MATERIALIZED VIEW v_{cluster_name} AS
                          SELECT COUNT(*) AS c1 FROM ten AS a1, ten AS a2, ten AS a3, ten AS a4;

                        > CREATE MATERIALIZED SOURCE s_{cluster_name}
                          FROM KAFKA BROKER '${{testdrive.kafka-addr}}'
                          TOPIC 'testdrive-instance-size-${{testdrive.seed}}'
                          FORMAT AVRO USING CONFLUENT SCHEMA REGISTRY '${{testdrive.schema-registry-url}}'
                          ENVELOPE NONE
                        """
                    )
                )

            # Validate that each individual cluster is operating properly
            for cluster_id in range(0, args.clusters):
                cluster_name = f"cluster_{cluster_id}"

                c.testdrive(
                    dedent(
                        f"""
                        > SET cluster={cluster_name}

                        > SELECT c1 FROM v_{cluster_name};
                        10000

                        > SELECT COUNT(*) FROM s_{cluster_name}
                        10000
                        """
                    )
                )
def run(self, c: Composition) -> None:
    c.testdrive(
        f"""
> SELECT MIN(f1), MAX(f1), COUNT(f1), COUNT(DISTINCT f1) FROM {self.table.name};
{self.table.watermarks.low} {self.table.watermarks.high} {(self.table.watermarks.high-self.table.watermarks.low)+1} {(self.table.watermarks.high-self.table.watermarks.low)+1}
"""
    )
def run(self, c: Composition) -> None:
    prev_high = self.table.watermarks.high
    self.table.watermarks.high = prev_high + self.delta
    c.testdrive(
        f"> INSERT INTO {self.table.name} SELECT * FROM generate_series({prev_high + 1}, {self.table.watermarks.high});"
    )
def run(self, c: Composition) -> None:
    self.table.watermarks.shift(-self.delta)
    c.testdrive(f"> UPDATE {self.table.name} SET f1 = f1 - {self.delta};")
def run(self, c: Composition) -> None:
    self.table.watermarks.high = max(
        self.table.watermarks.high - self.delta, self.table.watermarks.low
    )
    c.testdrive(
        f"> DELETE FROM {self.table.name} WHERE f1 > {self.table.watermarks.high};"
    )
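# The table and topic actions above track the range of live records through a
# watermarks object with `low`/`high` bounds (the Kafka variants use `min`/`max`)
# and a `shift` helper. A minimal sketch of such a class, assuming this shape;
# the real definition may differ:
from dataclasses import dataclass


@dataclass
class Watermarks:
    low: int = 0
    high: int = 0

    def shift(self, delta: int) -> None:
        # Move both ends of the range, e.g. after an UPDATE that adds a
        # constant to every f1 value.
        self.low += delta
        self.high += delta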
def run(self, c: Composition) -> None:
    if self.new_table:
        c.testdrive(
            f"""
> CREATE TABLE {self.table.name} (f1 INTEGER);

> INSERT INTO {self.table.name} VALUES ({self.table.watermarks.high});
"""
        )
def execute(self, c: Composition) -> None:
    c.testdrive(input=self.td_str)
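# execute() above replays a canned testdrive fragment against the composition.
# A hypothetical usage sketch, assuming a TdAction-style wrapper that stores
# the fragment in self.td_str; the class name and constructor are illustrative,
# not taken from the source:
class TdAction:
    def __init__(self, td_str: str) -> None:
        self.td_str = td_str

    def execute(self, c: Composition) -> None:
        c.testdrive(input=self.td_str)


# e.g. TdAction("> SELECT 1;\n1").execute(c)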