def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Stand up a Kafka broker via docker-compose, produce test records, run
    `datahub ingest` against it, and diff the emitted MCEs against the golden
    file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:
        wait_for_port(docker_services, "test_broker", 9092, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            # assert_result_ok (used by the other ingestion tests) surfaces the
            # CLI traceback on failure instead of a bare exit-code mismatch.
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path=tmp_path / "kafka_mces.json",
                golden_path=test_resources_dir / "kafka_mces_golden.json",
                ignore_paths=[],
            )
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Stand up HiveServer2 via docker-compose, seed it with hive_setup.sql,
    run `datahub ingest`, and diff the output against the golden file (with
    time-dependent custom properties ignored)."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "hive"
    ) as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Set up the container.
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            # assert_result_ok (used by the other ingestion tests) surfaces the
            # CLI traceback on failure instead of a bare exit-code mismatch.
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path=tmp_path / "hive_mces.json",
                golden_path=test_resources_dir / "hive_mces_golden.json",
                ignore_paths=[
                    # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
                    # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
                    r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]"
                ],
            )
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Stand up SQL Server via docker-compose, populate it with setup.sql, run
    `datahub ingest`, and compare the emitted MCEs against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/sql_server"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "sql-server"
    ) as docker_services:
        # Wait for SQL Server to be ready. We wait an extra couple seconds, as the port being available
        # does not mean the server is accepting connections.
        # TODO: find a better way to check for liveness.
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        command = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        ret = subprocess.run(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        # Surface the captured output in the assertion message; the previous
        # bare returncode check silently discarded it, making setup failures
        # hard to diagnose.
        assert ret.returncode == 0, (
            ret.stdout.decode(errors="replace") + ret.stderr.decode(errors="replace")
        )

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mssql_to_file.yml").resolve()
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            # assert_result_ok (used by the other ingestion tests) surfaces the
            # CLI traceback on failure instead of a bare exit-code mismatch.
            assert_result_ok(result)
            output = mce_helpers.load_json_file("mssql_mces.json")

            # Verify the output.
            golden = mce_helpers.load_json_file(
                str(test_resources_dir / "mssql_mce_golden.json")
            )
            mce_helpers.assert_mces_equal(output, golden)
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Stand up trino + hive via docker-compose, ingest both the
    postgres-backed and hive-backed trino catalogs, and diff each MCE output
    against its golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        # The trino port being open is not enough; poll its info endpoint
        # until the server reports it has finished starting.
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()[
                "starting"
            ]
            is False,
        )

        # Set up the hive db
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            # (leftover debug `print(tmp_path)` removed)
            # Run the metadata ingestion pipeline for trino catalog referring to postgres database
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )

            # Limitation 1 - MCE contains "nullable": true for all fields in trino database, irrespective of not null constraints present in underlying postgres database.
            # This is issue with trino, also reported here - https://github.com/trinodb/trino/issues/6400, Related : https://github.com/trinodb/trino/issues/4070
            # Limitation 2 - Dataset properties for postgres view (view definition, etc) are not part of MCE from trino.
            # Postgres views are exposed as tables in trino.
            # This setting depends on trino connector implementation - https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for trino catalog referring to hive database
            config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
def test_trino_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    """Ingest the postgres-backed trino catalog with full profiling enabled,
    writing events to a file sink, then diff against the golden file."""
    with fs_helpers.isolated_filesystem(tmp_path):
        # The file sink writes into the isolated tmp directory; the golden
        # comparison below reads it back by relative name.
        sink_path = tmp_path / "trino_mces.json"

        # Source side: trino catalog aliased as library_catalog, restricted to
        # the librarydb schema, with every profiling field switched on.
        source_config = TrinoConfig(
            host_port="localhost:5300",
            database="postgresqldb",
            database_alias="library_catalog",
            username="******",
            schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
            profile_pattern=AllowDenyPattern(allow=["library_catalog.librarydb.*"]),
            profiling=GEProfilingConfig(
                enabled=True,
                include_field_null_count=True,
                include_field_min_value=True,
                include_field_max_value=True,
                include_field_mean_value=True,
                include_field_median_value=True,
                include_field_stddev_value=True,
                include_field_quantiles=True,
                include_field_distinct_value_frequencies=True,
                include_field_histogram=True,
                include_field_sample_values=True,
            ),
        ).dict()

        # Run the metadata ingestion pipeline.
        ingestion_pipeline = Pipeline.create(
            {
                "run_id": "trino-test",
                "source": {"type": data_platform, "config": source_config},
                "sink": {
                    "type": "file",
                    "config": FileSinkConfig(filename=str(sink_path)).dict(),
                },
            }
        )
        ingestion_pipeline.run()
        ingestion_pipeline.pretty_print_summary()
        ingestion_pipeline.raise_from_status(raise_warnings=True)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
def run_datahub_cmd(
    command: List[str], tmp_path: Optional[Path] = None, check_result: bool = True
) -> Result:
    """Invoke the `datahub` CLI with *command* and return the click Result.

    When *tmp_path* is given, the command runs inside an isolated filesystem
    rooted there. When *check_result* is true (the default), the invocation
    is asserted to have succeeded via assert_result_ok.
    """
    cli_runner = CliRunner()

    if tmp_path is not None:
        with fs_helpers.isolated_filesystem(tmp_path):
            result = cli_runner.invoke(datahub, command)
    else:
        result = cli_runner.invoke(datahub, command)

    if check_result:
        assert_result_ok(result)
    return result
def test_mysql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Stand up MySQL via docker-compose, run `datahub ingest`, and compare
    the emitted MCEs against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mysql"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "mysql"
    ) as docker_services:
        wait_for_port(docker_services, "testmysql", 3306)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "mysql_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            # assert_result_ok (used by the other ingestion tests) surfaces the
            # CLI traceback on failure instead of a bare exit-code mismatch.
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="mysql_mces.json",
                golden_path=test_resources_dir / "mysql_mces_golden.json",
            )
def test_nifi_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Stand up one standalone NiFi instance plus a three-node cluster via
    docker-compose, run the nifi source against each, and diff both MCE
    outputs against their golden files."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/nifi"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "nifi"
    ) as docker_services:
        # nifi1 is the standalone instance; nifi01/02/03 form the cluster.
        for container, port, port_timeout in (
            ("nifi1", 9443, 300),
            ("nifi01", 9080, 60),
            ("nifi02", 9081, 60),
            ("nifi03", 9082, 60),
        ):
            wait_for_port(
                docker_services,
                container_name=container,
                container_port=port,
                timeout=port_timeout,
            )

        # Wait for nifi to execute all processors
        time.sleep(120)

        with fs_helpers.isolated_filesystem(tmp_path):
            # --- Standalone-instance ingestion run. ---
            standalone_config = {
                "run_id": "nifi-test-standalone",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9443/nifi/",
                        # Client-cert auth variant, kept for reference:
                        # "auth": "CLIENT_CERT",
                        # "client_cert_file": f"{test_resources_dir}/setup/ssl_files/client-cert.pem",
                        # "client_key_file": f"{test_resources_dir}/setup/ssl_files/client-private-key.pem",
                        # "client_key_password": "******",
                        # "ca_file": f"{test_resources_dir}/setup/ssl_files/server_certfile.pem",
                        "process_group_pattern": {"deny": ["^WIP"]},
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {"filename": "./nifi_mces.json"},
                },
            }
            pipeline = Pipeline.create(standalone_config)
            pipeline.run()
            pipeline.raise_from_status()

            # Verify the output. ignore values for aspects having last_event_time values
            # TODO: ignore paths with respect to aspect value in case of MCPs
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces.json",
                golden_path=test_resources_dir / "nifi_mces_golden_standalone.json",
                ignore_paths=[
                    r"root\[1\]\['aspect'\]\['value'\]",
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                ],
            )

            # --- Clustered-instance ingestion run. ---
            cluster_config = {
                "run_id": "nifi-test-cluster",
                "source": {
                    "type": "nifi",
                    "config": {
                        "site_url": "http://localhost:9080/nifi/",
                        "auth": "NO_AUTH",
                        "site_url_to_site_name": {
                            "http://nifi01:9080/nifi/": "default",
                            "http://nifi02:9081/nifi/": "default",
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {"filename": "./nifi_mces_cluster.json"},
                },
            }
            pipeline = Pipeline.create(cluster_config)
            pipeline.run()
            pipeline.raise_from_status()

            # Verify the output.
            # TODO: ignore paths with respect to aspect value in case of MCPs
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="nifi_mces_cluster.json",
                golden_path=test_resources_dir / "nifi_mces_golden_cluster.json",
                ignore_paths=[
                    r"root\[5\]\['aspect'\]\['value'\]",
                    r"root\[7\]\['aspect'\]\['value'\]",
                    r"root\[15\]\['aspect'\]\['value'\]",
                    r"root\[19\]\['aspect'\]\['value'\]",
                ],
            )