def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:
        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
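# For reference, the same ingestion can also be run programmatically rather than via the
# CLI. The sketch below is illustrative only: it assumes a recipe shape equivalent to
# kafka_to_file.yml (a Kafka source plus a file sink) using the ports exposed above; the
# actual recipe in the repo may differ.
#
#   from datahub.ingestion.run.pipeline import Pipeline
#
#   pipeline = Pipeline.create(
#       {
#           "source": {
#               "type": "kafka",
#               "config": {
#                   "connection": {
#                       "bootstrap": "localhost:59092",
#                       "schema_registry_url": "http://localhost:8081",
#                   }
#               },
#           },
#           "sink": {"type": "file", "config": {"filename": "kafka_mces.json"}},
#       }
#   )
#   pipeline.run()
#   pipeline.raise_from_status()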
def test_mysql_ingest_with_db_alias(
    docker_compose_runner, pytestconfig, tmp_path, mock_time
):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mysql_to_file_dbalias.yml").resolve()
        run_datahub_cmd(
            ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path
        )

        # Verify the output: assert that all generated events have instance-specific urns.
        import re

        urn_pattern = "^" + re.escape(
            "urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy."
        )
        mce_helpers.assert_mcp_entity_urn(
            filter="ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=tmp_path / "mysql_mces_dbalias.json",
        )
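# Note on the regex above: re.escape makes the literal urn prefix regex-safe (escaping the
# parentheses and dots), so the assertion checks that every dataset urn starts with the
# "foogalaxy" alias. Illustrative check against a hypothetical table urn:
#
#   import re
#   pattern = "^" + re.escape("urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy.")
#   assert re.match(pattern, "urn:li:dataset:(urn:li:dataPlatform:mysql,foogalaxy.db.tbl,PROD)")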
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/sql_server"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "sql-server"
    ) as docker_services:
        # Wait for SQL Server to be ready. We wait an extra couple of seconds, as the port
        # being available does not mean the server is accepting connections.
        # TODO: find a better way to check for liveness.
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        command = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        ret = subprocess.run(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        assert ret.returncode == 0

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mssql_to_file.yml").resolve()
        run_datahub_cmd(
            ["ingest", "-c", f"{config_file}"], tmp_path=tmp_path, check_result=True
        )

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mssql_mces.json",
            golden_path=test_resources_dir / "mssql_mces_golden.json",
        )
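# Because the sqlcmd output is captured above, a bare returncode assertion hides the error
# message when setup fails. An equivalent, slightly more debuggable variant (illustrative
# only, not the repo's actual code):
#
#   ret = subprocess.run(command, shell=True, capture_output=True, text=True)
#   assert ret.returncode == 0, ret.stdout + ret.stderr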
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "hive"
    ) as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Set up the container.
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "hive_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "hive_mces.json",
            golden_path=test_resources_dir / "hive_mces_golden.json",
            ignore_paths=[
                # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
                # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
                r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]",
                r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]",
            ],
        )
def test_openapi_ingest(pytestconfig, tmp_path):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/openapi"

    # Run the metadata ingestion pipeline.
    config_file = (test_resources_dir / "openapi_to_file.yml").resolve()
    run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

    # Verify the output. The recipe's file sink presumably writes to /tmp/openapi_mces.json,
    # hence the absolute output_path here rather than tmp_path.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path="/tmp/openapi_mces.json",
        golden_path=test_resources_dir / "openapi_mces_golden.json",
    )
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()["starting"]
            is False,
        )

        # Set up the hive db.
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipelines.
        with fs_helpers.isolated_filesystem(tmp_path):
            # Run the metadata ingestion pipeline for the trino catalog referring to the postgres database.
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            run_datahub_cmd(["ingest", "-c", f"{config_file}"])

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )
            # Limitation 1 - The MCE contains "nullable": true for all fields in the trino
            # database, irrespective of NOT NULL constraints present in the underlying
            # postgres database. This is an issue with trino, also reported here:
            # https://github.com/trinodb/trino/issues/6400
            # Related: https://github.com/trinodb/trino/issues/4070
            # Limitation 2 - Dataset properties for postgres views (view definition, etc.)
            # are not part of the MCE from trino. Postgres views are exposed as tables in
            # trino. This behavior depends on the trino connector implementation:
            # https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for the trino catalog referring to the hive database.
            config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
            run_datahub_cmd(["ingest", "-c", f"{config_file}"])

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
def test_list_all(verbose: bool) -> None:
    # This just verifies that it runs without error.
    args = ["check", "plugins"]
    if verbose:
        args.append("--verbose")

    result = run_datahub_cmd(args)
    assert len(result.output.splitlines()) > 20
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mysql_to_file.yml").resolve()
        run_datahub_cmd(
            ["ingest", "--strict-warnings", "-c", f"{config_file}"], tmp_path=tmp_path
        )

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mysql_mces.json",
            golden_path=test_resources_dir / "mysql_mces_golden.json",
        )
def test_kafka_connect_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka-connect"
    test_resources_dir_kafka = pytestconfig.rootpath / "tests/integration/kafka"

    # Share Compose configurations between files and projects
    # https://docs.docker.com/compose/extends/
    docker_compose_file = [
        str(test_resources_dir_kafka / "docker-compose.yml"),
        str(test_resources_dir / "docker-compose.override.yml"),
    ]
    with docker_compose_runner(
        docker_compose_file, "kafka-connect"
    ) as docker_services:
        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_connect", 58083, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get(
                "http://localhost:58083/connectors",
            ).status_code
            == 200,
        )

        # Create a MySQL source with no transformations, only a topic prefix.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "mysql_source1",
                "config": {
                    "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                    "mode": "incrementing",
                    "incrementing.column.name": "id",
                    "topic.prefix": "test-mysql-jdbc-",
                    "tasks.max": "1",
                    "connection.url": "${env:MYSQL_CONNECTION_URL}"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Create a MySQL source with a regex router transformation.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "mysql_source2",
                "config": {
                    "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                    "mode": "incrementing",
                    "incrementing.column.name": "id",
                    "tasks.max": "1",
                    "connection.url": "${env:MYSQL_CONNECTION_URL}",
                    "transforms": "TotalReplacement",
                    "transforms.TotalReplacement.type": "org.apache.kafka.connect.transforms.RegexRouter",
                    "transforms.TotalReplacement.regex": ".*(book)",
                    "transforms.TotalReplacement.replacement": "my-new-topic-$1"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Create a MySQL source with a regex router transformation, no topic prefix, and a table whitelist.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "mysql_source3",
                "config": {
                    "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                    "mode": "incrementing",
                    "incrementing.column.name": "id",
                    "table.whitelist": "book",
                    "tasks.max": "1",
                    "connection.url": "${env:MYSQL_CONNECTION_URL}",
                    "transforms": "TotalReplacement",
                    "transforms.TotalReplacement.type": "org.apache.kafka.connect.transforms.RegexRouter",
                    "transforms.TotalReplacement.regex": ".*",
                    "transforms.TotalReplacement.replacement": "my-new-topic"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Create a MySQL source with a query and a topic prefix.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "mysql_source4",
                "config": {
                    "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                    "mode": "incrementing",
                    "incrementing.column.name": "id",
                    "query": "select * from member",
                    "topic.prefix": "query-topic",
                    "tasks.max": "1",
                    "connection.url": "${env:MYSQL_CONNECTION_URL}"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Create a MySQL source with an ExtractTopic router transformation - source dataset not added.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "mysql_source5",
                "config": {
                    "connector.class": "io.confluent.connect.jdbc.JdbcSourceConnector",
                    "mode": "incrementing",
                    "incrementing.column.name": "id",
                    "table.whitelist": "book",
                    "topic.prefix": "test-mysql-jdbc2-",
                    "tasks.max": "1",
                    "connection.url": "${env:MYSQL_CONNECTION_URL}",
                    "transforms": "changetopic",
                    "transforms.changetopic.type": "io.confluent.connect.transforms.ExtractTopic$Value",
                    "transforms.changetopic.field": "name"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Create a MySQL sink connector - not added.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "mysql_sink",
                "config": {
                    "connector.class": "io.confluent.connect.jdbc.JdbcSinkConnector",
                    "insert.mode": "insert",
                    "auto.create": true,
                    "topics": "my-topic",
                    "tasks.max": "1",
                    "connection.url": "${env:MYSQL_CONNECTION_URL}"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Create a Debezium MySQL source connector.
        r = requests.post(
            "http://localhost:58083/connectors",
            headers={"Content-Type": "application/json"},
            data="""{
                "name": "debezium-mysql-connector",
                "config": {
                    "name": "debezium-mysql-connector",
                    "connector.class": "io.debezium.connector.mysql.MySqlConnector",
                    "database.hostname": "test_mysql",
                    "database.port": "3306",
                    "database.user": "******",
                    "database.password": "******",
                    "database.server.name": "debezium.topics",
                    "database.history.kafka.bootstrap.servers": "test_broker:9092",
                    "database.history.kafka.topic": "dbhistory.debeziummysql",
                    "include.schema.changes": "false"
                }
            }""",
        )
        assert r.status_code == 201  # Created

        # Give the connectors time to process the table data.
        time.sleep(45)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "kafka_connect_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_connect_mces.json",
            golden_path=test_resources_dir / "kafka_connect_mces_golden.json",
            ignore_paths=[],
        )
def test_check_mce_schema(pytestconfig: PytestConfig, json_filename: str) -> None:
    json_file_path = pytestconfig.rootpath / json_filename

    run_datahub_cmd(["check", "mce-file", f"{json_file_path}"])
def test_cli_version():
    result = run_datahub_cmd(["--debug", "version"])
    assert result.output
def test_cli_help():
    result = run_datahub_cmd(["--help"])
    assert result.output
def test_check_local_docker():
    # This just verifies that it runs without error.
    # We don't actually know what environment this will be run in, so
    # we can't depend on the output. Eventually, we should mock the docker SDK.
    result = run_datahub_cmd(["check", "local-docker"], check_result=False)
    assert result.output
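# For context: run_datahub_cmd is a shared test helper used throughout these tests. A
# minimal sketch of what such a helper could look like, assuming it wraps click's
# CliRunner around the datahub CLI entry point (the real helper may handle tmp_path,
# working-directory isolation, and result checking differently):
#
#   from click.testing import CliRunner
#   from datahub.entrypoints import datahub
#
#   def run_datahub_cmd(args, check_result=True):
#       runner = CliRunner()
#       result = runner.invoke(datahub, args)
#       if check_result:
#           assert result.exit_code == 0, result.output
#       return result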