def test_mode_ingest_success(pytestconfig, tmp_path):
    """Run the Mode source against a patched ``requests.session`` and diff against the golden file."""
    # The resources directory is published as a module-level global
    # (presumably consumed by the mocked request helper -- confirm).
    global test_resources_dir

    mces_file = f"{tmp_path}/mode_mces.json"
    source_section = {
        "type": "mode",
        "config": {
            "token": "xxxx",
            "password": "******",
            "connect_uri": "https://app.mode.com/",
            "workspace": "acryl",
        },
    }
    sink_section = {
        "type": "file",
        "config": {
            "filename": mces_file,
        },
    }

    with patch(
        "datahub.ingestion.source.mode.requests.session",
        side_effect=mocked_requests_sucess,
    ):
        test_resources_dir = pytestconfig.rootpath / "tests/integration/mode"

        pipeline = Pipeline.create(
            {
                "run_id": "mode-test",
                "source": source_section,
                "sink": sink_section,
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

        # Timestamps are excluded from the comparison.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=mces_file,
            golden_path=test_resources_dir / "mode_mces_golden.json",
            ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
        )
def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from a docker-compose MongoDB instance and compare with the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"
    compose_file = test_resources_dir / "docker-compose.yml"

    recipe = {
        "run_id": "mongodb-test",
        "source": {
            "type": "mongodb",
            "config": {
                "connect_uri": "mongodb://localhost:57017",
                "username": "******",
                "password": "******",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/mongodb_mces.json",
            },
        },
    }

    with docker_compose_runner(compose_file, "mongo") as docker_services:
        # Block until the container's MongoDB port is reachable.
        wait_for_port(docker_services, "testmongodb", 27017)

        # Execute the ingestion pipeline and fail on any reported error.
        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

        # Compare the produced file with the stored expectation.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mongodb_mces.json",
            golden_path=test_resources_dir / "mongodb_mces_golden.json",
        )
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from a docker-compose Hive server via the CLI and compare with the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"

    # Paths that are excluded from the golden-file comparison.
    # Sample path matched by the first pattern:
    #   root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
    #   root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
    ignored = [
        r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]",
        r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]",
    ]

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "hive"
    ) as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Populate the container by running the setup script through beeline.
        setup_cmd = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(setup_cmd, shell=True, check=True)

        # Drive the ingestion through the datahub CLI using the YAML recipe.
        recipe_path = (test_resources_dir / "hive_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{recipe_path}"], tmp_path=tmp_path)

        # Compare the produced file with the stored expectation.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "hive_mces.json",
            golden_path=test_resources_dir / "hive_mces_golden.json",
            ignore_paths=ignored,
        )
def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time):
    """Ingest the sample Feast feature store and compare with the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/feast"
    output_path = tmp_path / "feast_repository_mces.json"

    feast_source = {
        "type": "feast",
        "config": {
            "path": str(test_resources_dir / "feature_store"),
            "environment": "PROD",
        },
    }
    file_sink = {
        "type": "file",
        "config": {
            "filename": str(output_path),
        },
    }

    pipeline = Pipeline.create(
        {
            "run_id": "feast-repository-test",
            "source": feast_source,
            "sink": file_sink,
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=output_path,
        golden_path=test_resources_dir / "feast_repository_mces_golden.json",
    )
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from a docker-compose SQL Server and compare with the golden file.

    The database is populated by running ``/setup/setup.sql`` inside the
    container with ``sqlcmd``; the ingestion itself is driven through the
    datahub CLI with the ``mssql_to_file.yml`` recipe.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/sql_server"
    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "sql-server"
    ) as docker_services:
        # Wait for SQL Server to be ready. We wait an extra couple seconds, as the port being available
        # does not mean the server is accepting connections.
        # TODO: find a better way to check for liveness.
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        command = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        ret = subprocess.run(
            command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        # The output is captured, so surface it on failure; otherwise a
        # non-zero exit code would be reported with no diagnostics at all.
        assert ret.returncode == 0, (
            f"setup.sql failed with exit code {ret.returncode}\n"
            f"stdout: {ret.stdout.decode(errors='replace')}\n"
            f"stderr: {ret.stderr.decode(errors='replace')}"
        )

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mssql_to_file.yml").resolve()
        run_datahub_cmd(
            ["ingest", "-c", f"{config_file}"], tmp_path=tmp_path, check_result=True
        )

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mssql_mces.json",
            golden_path=test_resources_dir / "mssql_mces_golden.json",
        )
def test_bq_usage_source(pytestconfig, tmp_path):
    """Replay stored BigQuery log entries through the usage source and diff against the golden file.

    When ``WRITE_REFERENCE_FILE`` is truthy the reference log file is first
    regenerated by fetching entries through a real ``BigQueryUsageSource``.
    """
    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/bigquery-usage"
    )
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"

    if WRITE_REFERENCE_FILE:
        source = BigQueryUsageSource.create(
            dict(
                projects=[
                    "harshal-playground-306419",
                ],
                start_time=datetime.now(tz=timezone.utc) - timedelta(days=25),
            ),
            PipelineContext(run_id="bq-usage-test"),
        )
        logging_clients = source._make_bigquery_logging_clients()
        # Clear the ``logger`` field of every entry before serializing.
        entries = [
            entry._replace(logger=None)
            for entry in source._get_bigquery_log_entries_via_gcp_logging(
                logging_clients
            )
        ]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    recipe = {
        "run_id": "test-bigquery-usage",
        "source": {
            "type": "bigquery-usage",
            "config": {
                "projects": ["sample-bigquery-project-1234"],
                "start_time": "2021-01-01T00:00Z",
                "end_time": "2021-07-01T00:00Z",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/bigquery_usages.json",
            },
        },
    }

    with unittest.mock.patch(
        "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient",
        autospec=True,
    ) as MockClient:
        # Serve the stored reference entries from the mocked logging client.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Execute the usage ingestion against the mock.
        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "bigquery_usages.json",
        golden_path=test_resources_dir / "bigquery_usages_golden.json",
    )
def test_data_lake_local_ingest(pytestconfig, source_file, tmp_path, mock_time):
    """Run a data-lake source recipe against local test data and compare with the golden file.

    The source definition is loaded from ``SOURCE_FILES_PATH / source_file``
    and rewritten in place: the ``s3://my-test-bucket/`` prefix of the include
    path is replaced by the local test-data directory, profiling is switched
    on, and the ``aws_config`` entry is removed.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    # Use a context manager so the recipe file handle is closed deterministically.
    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {}

    source["config"]["path_spec"]["include"] = source["config"]["path_spec"][
        "include"
    ].replace("s3://my-test-bucket/", "tests/integration/s3/test_data/local_system/")
    source["config"]["profiling"]["enabled"] = True
    source["config"].pop("aws_config")

    config_dict["source"] = source
    config_dict["sink"] = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/{source_file}",
        },
    }
    config_dict["run_id"] = source_file

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=f"{test_resources_dir}/golden-files/local/golden_mces_{source_file}",
    )
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from a docker-compose Kafka broker and schema registry via the CLI, then diff against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"
    compose_file = test_resources_dir / "docker-compose.yml"

    with docker_compose_runner(compose_file, "kafka") as docker_services:
        # Both the broker and the schema registry must be reachable first.
        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        # Create topics and publish sample records with the helper script.
        produce_cmd = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(produce_cmd, shell=True, check=True)

        # Drive the ingestion through the datahub CLI using the YAML recipe.
        recipe_path = (test_resources_dir / "kafka_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{recipe_path}"], tmp_path=tmp_path)

        # Compare the produced file with the stored expectation.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
def test_serde_to_json(
    pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str
) -> None:
    """Round-trip a JSON file through the file source and file sink and check it is unchanged."""
    golden_file = pytestconfig.rootpath / json_filename

    output_filename = "output.json"
    output_file = tmp_path / output_filename

    # Read from the golden file and write straight back out to a temp file.
    file_source = {"type": "file", "config": {"filename": str(golden_file)}}
    file_sink = {"type": "file", "config": {"filename": str(output_file)}}

    pipeline = Pipeline.create(
        {
            "source": file_source,
            "sink": file_sink,
            "run_id": "serde_test",
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    # The written file is compared against the very file it was read from.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{output_filename}",
        golden_path=golden_file,
    )
def test_data_lake_s3_ingest(pytestconfig, s3_populate, source_file, tmp_path, mock_time):
    """Run a data-lake source recipe unmodified and compare with the S3 golden file.

    The source definition is loaded from ``SOURCE_FILES_PATH / source_file``
    and used as-is; output goes to a file named after ``source_file`` under
    ``tmp_path``.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    # Use a context manager so the recipe file handle is closed deterministically.
    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {}

    config_dict["source"] = source
    config_dict["sink"] = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/{source_file}",
        },
    }
    config_dict["run_id"] = source_file

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=f"{test_resources_dir}/golden-files/s3/golden_mces_{source_file}",
    )
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest two Trino catalogs (postgres-backed, then hive-backed) via the CLI and diff each against its golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    def _trino_finished_starting():
        # Trino's info endpoint reports ``starting: false`` once it is up.
        return requests.get("http://localhost:5300/v1/info").json()["starting"] is False

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "trino"
    ) as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)
        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=_trino_finished_starting,
        )

        # Populate the hive database by running the setup script through beeline.
        hive_setup_cmd = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(hive_setup_cmd, shell=True, check=True)

        # Invoke the CLI from inside tmp_path so relative output paths resolve there.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # First run: trino catalog that refers to the postgres database.
            postgres_recipe = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{postgres_recipe}"])
            assert_result_ok(result)

            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )
            # Limitation 1 - MCE contains "nullable": true for all fields in trino database, irrespective of not null constraints present in underlying postgres database.
            # This is issue with trino, also reported here - https://github.com/trinodb/trino/issues/6400, Related : https://github.com/trinodb/trino/issues/4070
            # Limitation 2 - Dataset properties for postgres view (view definition, etc) are not part of MCE from trino.
            # Postgres views are exposed as tables in trino. This setting depends on trino connector implementation - https://trino.io/episodes/18.html

            # Second run: trino catalog that refers to the hive database.
            hive_recipe = (test_resources_dir / "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{hive_recipe}"])
            assert_result_ok(result)

            # These custom properties are excluded from the comparison.
            hive_ignored = [
                r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
            ]
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=hive_ignored,
            )
def test_lookml_ingest(pytestconfig, tmp_path, mock_time):
    """Ingest the sample LookML project and compare with the expected output; warnings fail the test."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"

    lookml_config = {
        "base_folder": str(test_resources_dir / "lkml_samples"),
        "connection_to_platform_map": {"my_connection": "conn"},
        "parse_table_names_from_sql": True,
    }
    recipe = {
        "run_id": "lookml-test",
        "source": {
            "type": "lookml",
            "config": lookml_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/lookml_mces.json",
            },
        },
    }

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.pretty_print_summary()
    # Warnings are escalated to failures here.
    pipeline.raise_from_status(raise_warnings=True)

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "lookml_mces.json",
        golden_path=test_resources_dir / "expected_output.json",
    )
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from a docker-compose Kafka broker through ``CliRunner`` and diff against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"
    compose_file = test_resources_dir / "docker-compose.yml"

    with docker_compose_runner(compose_file, "kafka") as docker_services:
        wait_for_port(docker_services, "test_broker", 9092, timeout=120)

        # Create topics and publish sample records with the helper script.
        produce_cmd = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(produce_cmd, shell=True, check=True)

        # Invoke the CLI from inside tmp_path.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            recipe_path = (test_resources_dir / "kafka_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{recipe_path}"])
            assert result.exit_code == 0

            # Compare the produced file with the stored expectation.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path=tmp_path / "kafka_mces.json",
                golden_path=test_resources_dir / "kafka_mces_golden.json",
                ignore_paths=[],
            )
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    """Ingest the sample dbt artifacts with schema loading enabled and diff against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    # test manifest, catalog, sources are generated from https://github.com/kevinhu/sample-dbt
    dbt_config = {
        "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
        "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
        "sources_path": f"{test_resources_dir}/dbt_sources.json",
        "target_platform": "dbt",
        "load_schemas": True,
    }
    recipe = {
        "run_id": "dbt-test",
        "source": {
            "type": "dbt",
            "config": dbt_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/dbt_mces.json",
            },
        },
    }

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "dbt_mces.json",
        golden_path=test_resources_dir / "dbt_mces_golden.json",
    )
def test_ge_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time, **kwargs):
    """Run a Great Expectations checkpoint with the REST emitter mocked and diff the captured MCPs against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/great-expectations"
    mcps_file = tmp_path / "ge_mcps.json"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "great-expectations"
    ) as docker_services:
        with mock.patch(
            "datahub.emitter.rest_emitter.DatahubRestEmitter.emit_mcp"
        ) as mock_emit_mcp:
            wait_for_port(docker_services, "ge_postgres", 5432)

            # Route every emit_mcp call to the in-test mock emitter.
            emitter = MockDatahubEmitter("")
            mock_emit_mcp.side_effect = emitter.emit_mcp

            # Copy the prepared GE project into tmp_path and run its checkpoint.
            shutil.copytree(
                test_resources_dir / "setup/great_expectations",
                tmp_path / "great_expectations",
            )
            context = ge.DataContext.create(tmp_path)
            context.run_checkpoint(checkpoint_name="test_checkpoint")

            emitter.write_to_file(mcps_file)

            mce_helpers.check_golden_file(
                pytestconfig,
                output_path=mcps_file,
                golden_path=test_resources_dir / "ge_mcps_golden.json",
                ignore_paths=[],
            )
def test_azure_ad_source_nested_groups(pytestconfig, tmp_path):
    """Ingest groups and memberships (no users) from a mocked Azure AD source and diff against the nested-groups golden file."""
    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/azure_ad"
    )

    azure_config = {
        "client_id": "00000000-0000-0000-0000-000000000000",
        "tenant_id": "00000000-0000-0000-0000-000000000000",
        "client_secret": "client_secret",
        "redirect": "https://login.microsoftonline.com/common/oauth2/nativeclient",
        "authority": "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000",
        "token_url": "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000/oauth2/token",
        "graph_url": "https://graph.microsoft.com/v1.0",
        "ingest_group_membership": True,
        "ingest_groups": True,
        "ingest_users": False,
    }
    recipe = {
        "run_id": "test-azure-ad",
        "source": {
            "type": "azure-ad",
            "config": azure_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/azure_ad_mces_nested_groups.json",
            },
        },
    }

    # Replace the token, user, group and group-member fetchers with mocks.
    with patch(
        "datahub.ingestion.source.identity.azure_ad.AzureADSource.get_token"
    ) as mock_token, patch(
        "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_users"
    ) as mock_users, patch(
        "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_groups"
    ) as mock_groups, patch(
        "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_group_members"
    ) as mock_group_users:
        mocked_functions(
            test_resources_dir,
            mock_token,
            mock_users,
            mock_groups,
            mock_group_users,
            True,
        )

        # Execute the Azure AD ingestion against the mocks.
        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "azure_ad_mces_nested_groups.json",
        golden_path=test_resources_dir / "azure_ad_mces_golden_nested_groups.json",
    )
def test_looker_ingest(pytestconfig, tmp_path, mock_time):
    """Ingest a single mocked Looker dashboard and diff against the expected output."""
    mocked_client = mock.MagicMock()

    with mock.patch(
        "datahub.ingestion.source.looker.LookerDashboardSource._get_looker_client",
        mocked_client,
    ):
        looker_api = mocked_client.return_value

        # The dashboard listing returns one stub ...
        looker_api.all_dashboards.return_value = [Dashboard(id="1")]

        # ... and the detail lookup returns a dashboard with one element.
        element = DashboardElement(
            id="2",
            type="",
            subtitle_text="Some text",
            query=Query(
                model="data",
                view="my_view",
                dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
            ),
        )
        looker_api.dashboard.return_value = Dashboard(
            id="1",
            title="foo",
            created_at=datetime.utcfromtimestamp(time.time()),
            description="lorem ipsum",
            dashboard_elements=[element],
        )

        test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"

        recipe = {
            "run_id": "looker-test",
            "source": {
                "type": "looker",
                "config": {
                    "base_url": "https://looker.company.com",
                    "client_id": "foo",
                    "client_secret": "bar",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/looker_mces.json",
                },
            },
        }
        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "looker_mces.json",
            golden_path=test_resources_dir / "expected_output.json",
        )
def test_trino_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time):
    """Ingest the ``postgresqldb`` Trino catalog with full profiling and diff against the golden file."""
    # Work from inside tmp_path so the relative output path below resolves there.
    with fs_helpers.isolated_filesystem(tmp_path):
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        # Every profiling option shown here is switched on.
        profiling = GEProfilingConfig(
            enabled=True,
            include_field_null_count=True,
            include_field_min_value=True,
            include_field_max_value=True,
            include_field_mean_value=True,
            include_field_median_value=True,
            include_field_stddev_value=True,
            include_field_quantiles=True,
            include_field_distinct_value_frequencies=True,
            include_field_histogram=True,
            include_field_sample_values=True,
        )
        trino_config = TrinoConfig(
            host_port="localhost:5300",
            database="postgresqldb",
            database_alias="library_catalog",
            username="******",
            schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
            profile_pattern=AllowDenyPattern(allow=["library_catalog.librarydb.*"]),
            profiling=profiling,
        )

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type": data_platform,
                "config": trino_config.dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Execute the pipeline; warnings are escalated to failures.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)

        # Compare the produced file with the stored expectation.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
def test_tableau_ingest(pytestconfig, tmp_path):
    """Ingest from a mocked Tableau server client and diff against the golden file, ignoring timestamps."""
    # The resources directory is published as a module-level global
    # (presumably read by the metadata-query side effect -- confirm).
    global test_resources_dir
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/tableau"
    )

    mces_file = f"{tmp_path}/tableau_mces.json"
    tableau_config = {
        "username": "******",
        "password": "******",
        "connect_uri": "https://do-not-connect",
        "site": "acryl",
        "projects": ["default", "Project 2"],
        "ingest_tags": True,
        "ingest_owner": True,
        "default_schema_map": {
            "dvdrental": "public",
            "someotherdb": "schema",
        },
    }

    with mock.patch("tableauserverclient.Server") as mock_sdk:
        # Assemble a fake client: metadata queries go to the side effect,
        # sign-in and sign-out are no-ops.
        mocked_metadata = mock.Mock()
        mocked_metadata.query.side_effect = side_effect_query_metadata

        mock_client = mock.Mock()
        mock_client.metadata = mocked_metadata
        mock_client.auth = mock.Mock()
        mock_client.auth.sign_in.return_value = None
        mock_client.auth.sign_out.return_value = None

        mock_sdk.return_value = mock_client
        mock_sdk._auth_token = "ABC"

        pipeline = Pipeline.create(
            {
                "run_id": "tableau-test",
                "source": {
                    "type": "tableau",
                    "config": tableau_config,
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": mces_file,
                    },
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=mces_file,
            golden_path=test_resources_dir / "tableau_mces_golden.json",
            ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
        )
def test_trino_hive_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time):
    """Ingest the ``hivedb`` Trino catalog (schemas matching ``^db1``) and diff against the golden file."""
    mce_out_file = "trino_hive_mces.json"
    events_file = tmp_path / mce_out_file

    trino_config = TrinoConfig(
        host_port="localhost:5300",
        database="hivedb",
        username="******",
        schema_pattern=AllowDenyPattern(allow=["^db1"]),
    )
    pipeline_config = {
        "run_id": "trino-hive-test",
        "source": {
            "type": data_platform,
            "config": trino_config.dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Execute the pipeline; warnings are escalated to failures.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Limitation 1 - MCE contains "nullable": true for all fields in trino database, irrespective of not null constraints present in underlying postgres database.
    # This is issue with trino, also reported here - https://github.com/trinodb/trino/issues/6400, Related : https://github.com/trinodb/trino/issues/4070
    # Limitation 2 - Dataset properties for postgres view (view definition, etc) are not part of MCE from trino.
    # Postgres views are exposed as tables in trino. This setting depends on trino connector implementation - https://trino.io/episodes/18.html

    # These custom properties are excluded from the comparison.
    ignored = [
        r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
        r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
        r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
    ]
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=events_file,
        golden_path=test_resources_dir / "trino_hive_mces_golden.json",
        ignore_paths=ignored,
    )
def test_openapi_ingest(pytestconfig, tmp_path):
    """Run the OpenAPI recipe through the datahub CLI and diff against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/openapi"
    recipe_path = (test_resources_dir / "openapi_to_file.yml").resolve()

    # Drive the ingestion through the CLI.
    cli_args = ["ingest", "-c", f"{recipe_path}"]
    run_datahub_cmd(cli_args, tmp_path=tmp_path)

    # NOTE(review): the output is read from a fixed /tmp location rather than
    # tmp_path -- presumably this mirrors the sink filename in the YAML recipe;
    # confirm against openapi_to_file.yml.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path="/tmp/openapi_mces.json",
        golden_path=test_resources_dir / "openapi_mces_golden.json",
    )
def test_okta_source_include_deprovisioned_suspended_users(pytestconfig, tmp_path):
    """Ingest from a mocked Okta client with deprovisioned and suspended users included, then diff against the golden file."""
    test_resources_dir: pathlib.Path = pytestconfig.rootpath / "tests/integration/okta"

    # All option values are passed as strings, exactly as a recipe would supply them.
    okta_config = {
        "okta_domain": "mock-domain.okta.com",
        "okta_api_token": "mock-okta-token",
        "ingest_users": "True",
        "ingest_groups": "True",
        "ingest_group_membership": "True",
        "okta_profile_to_username_attr": "login",
        "okta_profile_to_username_regex": "([^@]+)",
        "okta_profile_to_group_name_attr": "name",
        "okta_profile_to_group_name_regex": "(.*)",
        "include_deprovisioned_users": "True",
        "include_suspended_users": "True",
        "page_size": "2",
        "delay_seconds": "0.00",
    }
    recipe = {
        "run_id": "test-okta-usage",
        "source": {
            "type": "okta",
            "config": okta_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/okta_mces_include_deprovisioned_suspended_users.json",
            },
        },
    }

    with patch("datahub.ingestion.source.identity.okta.OktaClient") as MockClient:
        _init_mock_okta_client(test_resources_dir, MockClient)

        # Execute the Okta ingestion against the mock client.
        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "okta_mces_include_deprovisioned_suspended_users.json",
        golden_path=test_resources_dir
        / "okta_mces_golden_include_deprovisioned_suspended_users.json",
    )
def test_redshift_usage_filtering(pytestconfig, tmp_path):
    """Run Redshift usage ingestion with schema/table filters against mocked query results and diff against the filtered golden file."""
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/redshift-usage"
    )

    # Only the "public" schema is allowed, and the "orders" table is denied.
    usage_config = {
        "host_port": "xxxxx",
        "database": "xxxxx",
        "username": "******",
        "password": "******",
        "email_domain": "acryl.io",
        "include_views": True,
        "include_tables": True,
        "schema_pattern": {"allow": ["public"]},
        "table_pattern": {"deny": ["orders"]},
    }
    recipe = {
        "run_id": "test-redshift-usage",
        "source": {
            "type": "redshift-usage",
            "config": usage_config,
        },
        "sink": {
            "type": "file",
            "config": {"filename": f"{tmp_path}/redshift_usages.json"},
        },
    }

    with patch(
        "datahub.ingestion.source.usage.redshift_usage.Engine.execute"
    ) as mock_engine_execute:
        # Every Engine.execute call returns the stored access events.
        access_events = load_access_events(test_resources_dir)
        mock_engine_execute.return_value = access_events

        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "redshift_usages.json",
        golden_path=test_resources_dir / "redshift_usages_filtered_golden.json",
    )
def test_sagemaker_ingest(tmp_path, pytestconfig):
    """Collect work units from a SageMaker source backed by stubbed client responses and diff against the golden file."""
    sagemaker_source_instance = sagemaker_source()
    mces_path = tmp_path / "sagemaker_mces.json"

    # Stubbed describe_feature_group responses, in the order they are queued.
    describe_stubs = (
        (describe_feature_group_response_1, "test-2"),
        (describe_feature_group_response_2, "test-1"),
        (describe_feature_group_response_3, "test"),
    )

    with Stubber(sagemaker_source_instance.sagemaker_client) as sagemaker_stubber:
        sagemaker_stubber.add_response(
            "list_feature_groups",
            list_feature_groups_response,
            {},
        )
        for response, group_name in describe_stubs:
            sagemaker_stubber.add_response(
                "describe_feature_group",
                response,
                {
                    "FeatureGroupName": group_name,
                },
            )

        # Convert every emitted work unit's MCE to a plain object.
        mce_objects = []
        for wu in sagemaker_source_instance.get_workunits():
            mce_objects.append(wu.mce.to_obj())

        with open(str(mces_path), "w") as f:
            json.dump(mce_objects, f, indent=2)

    # Compare the produced file with the stored expectation.
    test_resources_dir = pytestconfig.rootpath / "tests/unit/sagemaker"
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=mces_path,
        golden_path=test_resources_dir / "sagemaker_mces_golden.json",
    )
def test_redshift_usage_source(pytestconfig, tmp_path):
    """Run Redshift usage ingestion against mocked query results, check report counters, and diff against the golden file."""
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/redshift-usage"
    )

    recipe = {
        "run_id": "test-redshift-usage",
        "source": {
            "type": "redshift-usage",
            "config": {
                "host_port": "xxxxx",
                "database": "xxxxx",
                "username": "******",
                "password": "******",
                "email_domain": "acryl.io",
                "include_views": True,
                "include_tables": True,
            },
        },
        "sink": {
            "type": "file",
            "config": {"filename": f"{tmp_path}/redshift_usages.json"},
        },
    }

    with patch(
        "datahub.ingestion.source.usage.redshift_usage.Engine.execute"
    ) as mock_engine_execute:
        # Every Engine.execute call returns the stored access events.
        raw_access_events: List[Dict] = load_access_events(test_resources_dir)
        mock_engine_execute.return_value = raw_access_events

        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

        # There should be 2 calls (usage aspects -1, operation aspects -1).
        assert mock_engine_execute.call_count == 2

        # Check the per-kind work unit counters on the source report.
        source_report: RedshiftUsageSourceReport = cast(
            RedshiftUsageSourceReport, pipeline.source.get_report()
        )
        assert source_report.num_usage_workunits_emitted == 3
        assert source_report.num_operational_stats_workunits_emitted == 3

    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "redshift_usages.json",
        golden_path=test_resources_dir / "redshift_usages_golden.json",
    )
def ingestion_test(
    pytestconfig: Any,
    tmp_path: pathlib.Path,
    mock_time: int,
    mock_connection: DBConnection,
) -> None:  # noqa : No need for type annotations here
    """Ingest the sample LookML project with a mocked Looker API and diff against a per-dialect golden file.

    The output and golden file names both embed
    ``mock_connection.dialect_name``.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
    mce_out_file = f"lookml_mces_api_{mock_connection.dialect_name}.json"

    mocked_client = mock.MagicMock()
    mock_model = mock.MagicMock(project_name="lkml_samples")

    lookml_config = {
        "base_folder": str(test_resources_dir / "lkml_samples"),
        "api": {
            "client_id": "fake_client_id",
            "client_secret": "fake_secret",
            "base_url": "fake_account.looker.com",
        },
        "parse_table_names_from_sql": True,
    }
    recipe = {
        "run_id": "lookml-test",
        "source": {
            "type": "lookml",
            "config": lookml_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{mce_out_file}",
            },
        },
    }

    with mock.patch("looker_sdk.init31") as mock_sdk:
        # The SDK factory hands back the mock client, which in turn serves the
        # supplied connection and the stub model.
        mock_sdk.return_value = mocked_client
        mocked_client.connection.return_value = mock_connection
        mocked_client.lookml_model.return_value = mock_model

        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.pretty_print_summary()
        # Warnings are escalated to failures here.
        pipeline.raise_from_status(raise_warnings=True)

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / mce_out_file,
            golden_path=test_resources_dir / mce_out_file,
        )
def test_data_lake_ingest(pytestconfig, tmp_path, mock_time):
    """Ingest and profile the local data-lake test data, then diff against the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/data_lake/"

    # Field-level profiling with every listed statistic switched on.
    profiling_options = {
        "enabled": True,
        "profile_table_level_only": False,
        "include_field_min_value": True,
        "include_field_max_value": True,
        "include_field_mean_value": True,
        "include_field_median_value": True,
        "include_field_stddev_value": True,
        "include_field_quantiles": True,
        "include_field_distinct_value_frequencies": True,
        "include_field_histogram": True,
        "include_field_sample_values": True,
    }
    data_lake_config = {
        "base_path": str(test_resources_dir / "test_data"),
        # should be enabled for testing since full paths will differ on different machines
        "use_relative_path": True,
        "path_spec": "./{name[0]}/{name[1]}.{format}",
        "platform": "data-lake-test",
        "profiling": profiling_options,
    }

    # Execute the ingestion pipeline and fail on any reported error.
    pipeline = Pipeline.create(
        {
            "run_id": "data-lake-test",
            "source": {
                "type": "data-lake",
                "config": data_lake_config,
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/data_lake_mces.json",
                },
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    # Compare the produced file with the stored expectation.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "data_lake_mces.json",
        golden_path=test_resources_dir / "data_lake_mces_golden.json",
    )
def test_trino_usage_source(pytestconfig, tmp_path):
    """Run Starburst/Trino usage ingestion against a mocked query history and diff against the golden file."""
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/starburst-trino-usage"
    )

    usage_config = {
        "host_port": "xxxxx",
        "database": "testcatalog",
        "username": "******",
        "password": "******",
        "audit_catalog": "test",
        "audit_schema": "test",
        "email_domain": "acryl.io",
        "include_views": True,
        "include_tables": True,
    }
    recipe = {
        "run_id": "test-trino-usage",
        "source": {
            "type": "starburst-trino-usage",
            "config": usage_config,
        },
        "sink": {
            "type": "file",
            "config": {"filename": f"{tmp_path}/trino_usages.json"},
        },
    }

    with patch(
        "datahub.ingestion.source.usage.starburst_trino_usage.TrinoUsageSource._get_trino_history"
    ) as mock_event_history:
        # The history lookup returns the stored access events.
        access_events = load_access_events(test_resources_dir)
        mock_event_history.return_value = access_events

        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "trino_usages.json",
        golden_path=test_resources_dir / "trino_usages_golden.json",
    )
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    """Ingest the sample dbt artifacts once with and once without schema loading, diffing each run against its golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    with_schemas = DbtTestConfig(
        "dbt-test-with-schemas",
        test_resources_dir,
        tmp_path,
        "dbt_with_schemas_mces.json",
        "dbt_with_schemas_mces_golden.json",
        source_config_modifiers={"load_schemas": True},
    )
    without_schemas = DbtTestConfig(
        "dbt-test-without-schemas",
        test_resources_dir,
        tmp_path,
        "dbt_without_schemas_mces.json",
        "dbt_without_schemas_mces_golden.json",
        source_config_modifiers={"load_schemas": False},
    )

    for variant in (with_schemas, without_schemas):
        # test manifest, catalog, sources are generated from https://github.com/kevinhu/sample-dbt
        recipe = {
            "run_id": variant.run_id,
            "source": {"type": "dbt", "config": variant.source_config},
            "sink": {
                "type": "file",
                "config": variant.sink_config,
            },
        }
        pipeline = Pipeline.create(recipe)
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=variant.output_path,
            golden_path=variant.golden_path,
        )
def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time):
    """Incorrect specification of sql parser should not fail ingestion"""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
    mce_out = "lookml_mces_badsql_parser.json"

    lookml_config = {
        "base_folder": str(test_resources_dir / "lkml_samples"),
        "connection_to_platform_map": {
            "my_connection": {
                "platform": "snowflake",
                "default_db": "default_db",
                "default_schema": "default_schema",
            }
        },
        "parse_table_names_from_sql": True,
        "project_name": "lkml_samples",
        # Deliberately not a valid parser specification.
        "sql_parser": "bad.sql.Parser",
    }
    recipe = {
        "run_id": "lookml-test",
        "source": {
            "type": "lookml",
            "config": lookml_config,
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{mce_out}",
            },
        },
    }

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.pretty_print_summary()

    # With warnings tolerated the run must pass ...
    pipeline.raise_from_status(raise_warnings=False)

    # ... but escalating warnings must raise.
    try:
        pipeline.raise_from_status(raise_warnings=True)
    except PipelineExecutionError:
        pass
    else:
        assert False, "Pipeline should have generated warnings"

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / mce_out,
        golden_path=test_resources_dir / mce_out,
    )