def test_trino_instance_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    """Ingest the Trino hive catalog with a platform instance configured.

    The pipeline writes to a file sink under ``tmp_path``; the assertions
    then inspect that file for instance-qualified dataset urns and for a
    ``dataPlatformInstance`` aspect on every dataset entity.
    """
    instance = "production_warehouse"
    platform = "trino"
    mce_out_file = "trino_instance_mces.json"
    events_file = tmp_path / mce_out_file

    pipeline_config = {
        "run_id": "trino-hive-instance-test",
        "source": {
            "type": data_platform,
            "config": TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                # Reuse ``instance`` so the configured value and the urns
                # asserted below cannot drift apart.
                platform_instance=instance,
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Assert that all events generated have instance specific urns
    urn_pattern = "^" + re.escape(
        f"urn:li:dataset:(urn:li:dataPlatform:{platform},{instance}."
    )
    # NOTE(review): the messages below say "at least one", yet ``>= 0`` also
    # accepts zero if the helpers return a match count - confirm whether
    # these should be ``>= 1``.
    assert (
        mce_helpers.assert_mce_entity_urn(
            "ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=events_file,
        )
        >= 0
    ), "There should be at least one match"

    assert (
        mce_helpers.assert_mcp_entity_urn(
            "ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=events_file,
        )
        >= 0
    ), "There should be at least one MCP"

    # all dataset entities emitted must have a dataPlatformInstance aspect emitted
    # there must be at least one entity emitted
    assert (
        mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="dataPlatformInstance",
            aspect_field_matcher={
                "instance": f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:{platform},{instance})"
            },
            file=events_file,
        )
        >= 1
    )
def test_pipeline_process_commits(self, commit_policy, source, should_commit):
    """Check that a registered committable is committed exactly when expected."""

    class FakeCommittable(Committable):
        """Minimal committable whose ``commit`` does nothing."""

        def __init__(self, commit_policy: CommitPolicy):
            self.name = "test_checkpointer"
            self.commit_policy = commit_policy

        def commit(self) -> None:
            pass

    recipe = {
        "source": {"type": f"tests.unit.test_pipeline.{source}"},
        "sink": {"type": "console"},
        "run_id": "pipeline_test",
    }
    pipeline = Pipeline.create(recipe)

    fake_committable: Committable = FakeCommittable(commit_policy)
    # Wrap the real ``commit`` so calls still go through while being recorded.
    with patch.object(
        FakeCommittable, "commit", wraps=fake_committable.commit
    ) as mock_commit:
        pipeline.ctx.register_reporter(fake_committable)
        pipeline.run()

        # The commit method must be called once only if should_commit is True.
        verify = (
            mock_commit.assert_called_once
            if should_commit
            else mock_commit.assert_not_called
        )
        verify()
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    """Run the dbt source on the checked-in artifacts and compare with the golden file."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    # test manifest, catalog, sources are generated from
    # https://github.com/kevinhu/sample-dbt
    dbt_source = {
        "type": "dbt",
        "config": {
            "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
            "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
            "sources_path": f"{test_resources_dir}/dbt_sources.json",
            "target_platform": "dbt",
            "load_schemas": True,
        },
    }
    file_sink = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/dbt_mces.json",
        },
    }

    pipeline = Pipeline.create(
        {"run_id": "dbt-test", "source": dbt_source, "sink": file_sink}
    )
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "dbt_mces.json",
        golden_path=test_resources_dir / "dbt_mces_golden.json",
    )
def test_serde_large(pytestconfig, tmp_path):
    """Round-trip the large serde fixture through file source and file sink."""
    json_filename = "test_serde_large.json"
    output_filename = "output.json"

    golden_file = pytestconfig.rootpath / "tests/unit/serde" / json_filename
    output_file = tmp_path / output_filename

    def _file_component(path):
        # Source and sink share the same shape: a "file" type plus a filename.
        return {"type": "file", "config": {"filename": str(path)}}

    pipeline = Pipeline.create(
        {
            "source": _file_component(golden_file),
            "sink": _file_component(output_file),
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    # What was written must match what was read.
    output = mce_helpers.load_json_file(output_file)
    golden = mce_helpers.load_json_file(golden_file)
    mce_helpers.assert_mces_equal(output, golden)
def test_data_lake_s3_ingest(
    pytestconfig, s3_populate, source_file, tmp_path, mock_time
):
    """Run the source described by ``source_file`` and compare with its golden file.

    ``source_file`` names a JSON file under ``SOURCE_FILES_PATH`` holding the
    pipeline's ``source`` section; the same name is reused for the run id,
    the output file and the golden file suffix.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {
        "source": source,
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{source_file}",
            },
        },
        "run_id": source_file,
    }

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=f"{test_resources_dir}/golden-files/s3/golden_mces_{source_file}",
    )
def create_and_run_test_pipeline(
    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]],
    transformers: List[Dict[str, Any]],
    path: str,
) -> str:
    """Feed ``events`` through ``transformers`` into a file sink.

    ``TestSource.get_workunits`` is patched to yield one work unit per event.
    Returns the path of the JSON file the sink wrote to, which is a freshly
    generated uuid-named file under ``path``.
    """

    def _to_workunit(event):
        # MCEs are keyed by snapshot urn, everything else by entity urn + aspect.
        if isinstance(event, MetadataChangeEventClass):
            return workunit.MetadataWorkUnit(
                id=f"test-workunit-mce-{event.proposedSnapshot.urn}", mce=event
            )
        return workunit.MetadataWorkUnit(
            id=f"test-workunit-mcp-{event.entityUrn}-{event.aspectName}", mcp=event
        )

    with mock.patch(
        "tests.unit.test_source.TestSource.get_workunits"
    ) as mock_getworkunits:
        mock_getworkunits.return_value = [_to_workunit(e) for e in events]

        events_file = f"{path}/{str(uuid4())}.json"
        recipe = {
            "source": {
                "type": "tests.unit.test_source.TestSource",
                "config": {},
            },
            "transformers": transformers,
            "sink": {"type": "file", "config": {"filename": events_file}},
        }

        pipeline = Pipeline.create(config_dict=recipe)
        pipeline.run()
        pipeline.raise_from_status()

    return events_file
def test_ldap_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerised openldap server and compare with the golden MCEs."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap"
    compose_file = test_resources_dir / "docker-compose.yml"

    with docker_compose_runner(compose_file, "ldap") as docker_services:
        # The openldap container loads the sample data after exposing the port
        # publicly. As such, we must wait a little bit extra to ensure that the
        # sample data is loaded.
        wait_for_port(docker_services, "openldap", 389)
        time.sleep(5)

        ldap_source = {
            "type": "ldap",
            "config": {
                "ldap_server": "ldap://localhost",
                "ldap_user": "******",
                "ldap_password": "******",
                "base_dn": "dc=example,dc=org",
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/ldap_mces.json",
            },
        }

        pipeline = Pipeline.create(
            {"run_id": "ldap-test", "source": ldap_source, "sink": file_sink}
        )
        pipeline.run()
        pipeline.raise_from_status()

        actual = mce_helpers.load_json_file(str(tmp_path / "ldap_mces.json"))
        expected = mce_helpers.load_json_file(
            str(test_resources_dir / "ldap_mces_golden.json")
        )
        mce_helpers.assert_mces_equal(actual, expected)
def test_serde_to_json(
    pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str
) -> None:
    """Read ``json_filename`` with the file source, write it back, and golden-check it."""
    output_filename = "output.json"
    golden_file = pytestconfig.rootpath / json_filename
    output_file = tmp_path / output_filename

    recipe = {
        "source": {"type": "file", "config": {"filename": str(golden_file)}},
        "sink": {"type": "file", "config": {"filename": str(output_file)}},
        "run_id": "serde_test",
    }

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()

    # The input file doubles as the golden file for the round trip.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{output_filename}",
        golden_path=golden_file,
    )
def test_data_lake_local_ingest(pytestconfig, source_file, tmp_path, mock_time):
    """Run an S3 source definition against the local copy of the test data.

    The source JSON named by ``source_file`` is loaded from
    ``SOURCE_FILES_PATH`` and rewritten before use: the S3 bucket prefix in
    the path spec is swapped for the local test-data directory, profiling is
    switched on, and the ``aws_config`` entry is removed.
    """
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    # Point the path spec at the local files rather than the bucket.
    path_spec = source["config"]["path_spec"]
    path_spec["include"] = path_spec["include"].replace(
        "s3://my-test-bucket/", "tests/integration/s3/test_data/local_system/"
    )
    source["config"]["profiling"]["enabled"] = True
    source["config"].pop("aws_config")

    config_dict = {
        "source": source,
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{source_file}",
            },
        },
        "run_id": source_file,
    }

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=f"{test_resources_dir}/golden-files/local/golden_mces_{source_file}",
    )
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    """Run the dbt source on the manifest and catalog, then compare with the golden MCEs."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    dbt_source = {
        "type": "dbt",
        "config": {
            "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
            "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
            "target_platform": "dbt",
            "load_schemas": True,
        },
    }
    file_sink = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/dbt_mces.json",
        },
    }

    pipeline = Pipeline.create(
        {"run_id": "dbt-test", "source": dbt_source, "sink": file_sink}
    )
    pipeline.run()
    pipeline.raise_from_status()

    actual = mce_helpers.load_json_file(str(tmp_path / "dbt_mces.json"))
    expected = mce_helpers.load_json_file(
        str(test_resources_dir / "dbt_mces_golden.json")
    )
    mce_helpers.assert_mces_equal(actual, expected)
def test_run_including_fake_transformation(self):
    """The recording sink must receive the initial MCE plus the status-removed aspect."""
    recipe = {
        "source": {"type": "tests.unit.test_pipeline.FakeSource"},
        "transformers": [
            {"type": "tests.unit.test_pipeline.AddStatusRemovedTransformer"}
        ],
        "sink": {"type": "tests.test_helpers.sink_helpers.RecordingSink"},
    }
    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()

    # Build the expectation: the initial MCE with the extra aspect appended.
    expected_mce = get_initial_mce()
    snapshot = cast(DatasetSnapshotClass, expected_mce.proposedSnapshot)
    snapshot.aspects.append(get_status_removed_aspect())

    sink_report: RecordingSinkReport = cast(
        RecordingSinkReport, pipeline.sink.get_report()
    )
    received = sink_report.received_records

    self.assertEqual(len(received), 1)
    self.assertEqual(expected_mce, received[0].record)
def test_configure_with_rest_sink_initializes_graph(
    self, mock_source, mock_test_connection
):
    """A ``datahub-rest`` sink must also populate ``pipeline.ctx.graph``."""
    server = "http://somehost.someplace.some:8080"
    token = "foo"

    pipeline = Pipeline.create(
        {
            "source": {
                "type": "file",
                "config": {"filename": "test_events.json"},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {"server": server, "token": token},
            },
        }
    )

    # assert that the default sink config is for a DatahubRestSink
    sink = pipeline.config.sink
    assert isinstance(sink, DynamicTypedConfig)
    assert sink.type == "datahub-rest"
    assert sink.config == {"server": server, "token": token}

    # The graph client must mirror the sink's connection settings.
    assert pipeline.ctx.graph is not None, "DataHubGraph should be initialized"
    assert pipeline.ctx.graph.config.server == pipeline.config.sink.config["server"]
    assert pipeline.ctx.graph.config.token == pipeline.config.sink.config["token"]
def test_lookml_ingest(pytestconfig, tmp_path, mock_time):
    """Run the lookml source over the fixture folder and compare with the expected output."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"

    lookml_source = {
        "type": "lookml",
        "config": {
            "base_folder": str(test_resources_dir),
            "connection_to_platform_map": {"my_connection": "conn"},
            "parse_table_names_from_sql": True,
        },
    }
    file_sink = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/lookml_mces.json",
        },
    }

    pipeline = Pipeline.create(
        {"run_id": "lookml-test", "source": lookml_source, "sink": file_sink}
    )
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "lookml_mces.json",
        golden_path=test_resources_dir / "expected_output.json",
    )
def test_serde_to_json(
    pytestconfig: PytestConfig, tmp_path: pathlib.Path, json_filename: str
) -> None:
    """Read ``json_filename`` with the file source, write it back, and require equal JSON."""
    output_filename = "output.json"
    golden_file = pytestconfig.rootpath / json_filename
    output_file = tmp_path / output_filename

    recipe = {
        "source": {"type": "file", "config": {"filename": str(golden_file)}},
        "sink": {"type": "file", "config": {"filename": str(output_file)}},
    }

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()

    # Plain equality on the loaded JSON: the round trip must be lossless.
    output = mce_helpers.load_json_file(output_file)
    golden = mce_helpers.load_json_file(golden_file)
    assert golden == output
def datahub_recipe():
    """Load the recipe YAML and run it as an ingestion pipeline.

    Raises whatever ``raise_from_status`` raises for the finished run.
    """
    with open("path/to/recipe.yml") as recipe_file:
        recipe = yaml.safe_load(recipe_file)

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()
def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    """Ingest from the dockerised MongoDB and compare with the golden MCEs."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"
    compose_file = test_resources_dir / "docker-compose.yml"

    with docker_compose_runner(compose_file, "mongo") as docker_services:
        wait_for_port(docker_services, "testmongodb", 27017)

        # Run the metadata ingestion pipeline.
        mongodb_source = {
            "type": "mongodb",
            "config": {
                "connect_uri": "mongodb://localhost:57017",
                "username": "******",
                "password": "******",
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/mongodb_mces.json",
            },
        }
        pipeline = Pipeline.create(
            {"run_id": "mongodb-test", "source": mongodb_source, "sink": file_sink}
        )
        pipeline.run()
        pipeline.raise_from_status()

        # Verify the output.
        actual = mce_helpers.load_json_file(str(tmp_path / "mongodb_mces.json"))
        expected = mce_helpers.load_json_file(
            str(test_resources_dir / "mongodb_mce_golden.json")
        )
        mce_helpers.assert_mces_equal(actual, expected)
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
) -> None:
    """Ingest metadata into DataHub."""
    # Flow: load the recipe, build the pipeline, run it, then exit the
    # process with the status returned by the summary printer.
    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    pipeline_config = load_config_file(pathlib.Path(config))

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config, dry_run, preview, preview_workunits)
    except ValidationError as validation_error:
        # Config validation problems are shown to the user as-is.
        click.echo(validation_error, err=True)
        sys.exit(1)
    except Exception as unexpected_error:
        # The pipeline_config may contain sensitive information, so we wrap the
        # exception in a SensitiveError to prevent detailed variable-level
        # information from being logged.
        raise SensitiveError() from unexpected_error

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")

    exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    pipeline.log_ingestion_stats()
    sys.exit(exit_code)
def ingest(config: str) -> None:
    """Main command for ingesting metadata into DataHub"""
    # Flow: pick a config loader from the file suffix, build and run the
    # pipeline, then exit with the status returned by the summary printer.
    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            f"Only .toml and .yml are supported. Cannot process file type {config_file.suffix}"
        )

    with config_file.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    try:
        # Recipes can carry credentials, so the full config is only written
        # at DEBUG level rather than appearing in default INFO output.
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    except ValidationError as e:
        # Config validation problems are shown to the user as-is.
        click.echo(e, err=True)
        sys.exit(1)

    pipeline.run()
    ret = pipeline.pretty_print_summary()
    sys.exit(ret)
def ingest(config: str):
    """Main command for ingesting metadata into DataHub"""
    # Flow: pick a config loader from the file suffix, build the pipeline
    # under the validation-error formatter, then run and summarise it.
    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    suffix = config_file.suffix
    config_mech: ConfigurationMechanism
    if suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    elif suffix in (".yaml", ".yml"):
        config_mech = YamlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                suffix
            )
        )

    with config_file.open() as config_fp:
        pipeline_config = config_mech.load_config(config_fp)

    with nicely_formatted_validation_errors():
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)

    pipeline.run()
    pipeline.pretty_print_summary()
def test_mongodb_ingest(mongodb, pytestconfig, tmp_path, mock_time):
    """Ingest from the MongoDB fixture and compare with the golden MCEs."""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"

    mongodb_source = {
        "type": "mongodb",
        "config": {
            "connect_uri": "mongodb://localhost:57017",
            "username": "******",
            "password": "******",
        },
    }
    file_sink = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/mongodb_mces.json",
        },
    }

    pipeline = Pipeline.create(
        {"run_id": "mongodb-test", "source": mongodb_source, "sink": file_sink}
    )
    pipeline.run()
    pipeline.raise_from_status()

    actual = mce_helpers.load_json_file(str(tmp_path / "mongodb_mces.json"))
    expected = mce_helpers.load_json_file(
        str(test_resources_dir / "mongodb_mce_golden.json")
    )
    mce_helpers.assert_mces_equal(actual, expected)
def test_azure_ad_source_nested_groups(pytestconfig, tmp_path):
    """Run the azure-ad source with patched API calls and compare with the nested-groups golden file."""
    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/azure_ad"
    )
    # Every patched attribute lives on the same source class.
    source_cls = "datahub.ingestion.source.identity.azure_ad.AzureADSource"
    output_name = "azure_ad_mces_nested_groups.json"

    with patch(f"{source_cls}.get_token") as mock_token, patch(
        f"{source_cls}._get_azure_ad_users"
    ) as mock_users, patch(
        f"{source_cls}._get_azure_ad_groups"
    ) as mock_groups, patch(
        f"{source_cls}._get_azure_ad_group_members"
    ) as mock_group_users:
        mocked_functions(
            test_resources_dir,
            mock_token,
            mock_users,
            mock_groups,
            mock_group_users,
            True,
        )

        # Run an azure usage ingestion run.
        azure_source = {
            "type": "azure-ad",
            "config": {
                "client_id": "00000000-0000-0000-0000-000000000000",
                "tenant_id": "00000000-0000-0000-0000-000000000000",
                "client_secret": "client_secret",
                "redirect": "https://login.microsoftonline.com/common/oauth2/nativeclient",
                "authority": "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000",
                "token_url": "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000/oauth2/token",
                "graph_url": "https://graph.microsoft.com/v1.0",
                "ingest_group_membership": True,
                "ingest_groups": True,
                "ingest_users": False,
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{output_name}",
            },
        }
        pipeline = Pipeline.create(
            {"run_id": "test-azure-ad", "source": azure_source, "sink": file_sink}
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / output_name,
            golden_path=test_resources_dir / "azure_ad_mces_golden_nested_groups.json",
        )
def test_bq_usage_source(pytestconfig, tmp_path):
    """Replay recorded BigQuery log entries through the usage source and golden-check the output.

    When ``WRITE_REFERENCE_FILE`` is truthy, the reference log file is first
    rebuilt from entries fetched through the source's own BigQuery clients.
    """
    test_resources_dir: pathlib.Path = (
        pytestconfig.rootpath / "tests/integration/bigquery-usage"
    )
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"

    if WRITE_REFERENCE_FILE:
        source = BigQueryUsageSource.create(
            {
                "projects": ["harshal-playground-306419"],
                "start_time": datetime.now(tz=timezone.utc) - timedelta(days=25),
            },
            PipelineContext(run_id="bq-usage-test"),
        )
        clients = source._make_bigquery_clients()
        fetched = list(source._get_bigquery_log_entries(clients))
        # Blank out the logger field before pickling the entries.
        entries = [entry._replace(logger=None) for entry in fetched]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    with unittest.mock.patch(
        "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient",
        autospec=True,
    ) as MockClient:
        # Add mock BigQuery API responses.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Run a BigQuery usage ingestion run.
        usage_source = {
            "type": "bigquery-usage",
            "config": {
                "projects": ["sample-bigquery-project-1234"],
                "start_time": "2021-01-01T00:00Z",
                "end_time": "2021-07-01T00:00Z",
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/bigquery_usages.json",
            },
        }
        pipeline = Pipeline.create(
            {
                "run_id": "test-bigquery-usage",
                "source": usage_source,
                "sink": file_sink,
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "bigquery_usages.json",
            golden_path=test_resources_dir / "bigquery_usages_golden.json",
        )
def get_current_checkpoint_from_pipeline(
    pipeline_config_dict: Dict[str, Any]
) -> Optional[Checkpoint]:
    """Run the pipeline for ``pipeline_config_dict`` and return its source's current checkpoint.

    The pipeline's source is treated as a ``MySQLSource`` and queried with its
    default ingestion job id.
    """
    pipeline = Pipeline.create(pipeline_config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    mysql_source = cast(MySQLSource, pipeline.source)
    job_id = mysql_source.get_default_ingestion_job_id()
    return mysql_source.get_current_checkpoint(job_id)
def test_trino_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    """Ingest the Trino postgres catalog with profiling enabled and golden-check the output.

    Everything runs inside ``fs_helpers.isolated_filesystem(tmp_path)``; the
    file sink writes to ``tmp_path / "trino_mces.json"``.
    """
    with fs_helpers.isolated_filesystem(tmp_path):
        # Run the metadata ingestion pipeline for trino catalog referring to postgres database
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type": data_platform,
                "config": TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]
                    ),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)

        # Verify the output. Check the exact path the sink wrote to, rather
        # than a bare file name that only resolves via the working directory.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=events_file,
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
def test_looker_ingest(pytestconfig, tmp_path, mock_time):
    """Run the looker source against a mocked client serving one dashboard."""
    mocked_client = mock.MagicMock()
    with mock.patch(
        "datahub.ingestion.source.looker.LookerDashboardSource._get_looker_client",
        mocked_client,
    ):
        # The listing returns a stub; the detail call returns the full dashboard.
        looker_api = mocked_client.return_value
        looker_api.all_dashboards.return_value = [Dashboard(id="1")]

        dashboard_query = Query(
            model="data",
            view="my_view",
            dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
        )
        looker_api.dashboard.return_value = Dashboard(
            id="1",
            title="foo",
            created_at=datetime.utcfromtimestamp(time.time()),
            description="lorem ipsum",
            dashboard_elements=[
                DashboardElement(
                    id="2",
                    type="",
                    subtitle_text="Some text",
                    query=dashboard_query,
                )
            ],
        )

        test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"

        looker_source = {
            "type": "looker",
            "config": {
                "base_url": "https://looker.company.com",
                "client_id": "foo",
                "client_secret": "bar",
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/looker_mces.json",
            },
        }
        pipeline = Pipeline.create(
            {"run_id": "looker-test", "source": looker_source, "sink": file_sink}
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "looker_mces.json",
            golden_path=test_resources_dir / "expected_output.json",
        )
def test_configure(self, mock_sink, mock_source):
    """Creating and running a kafka-to-console pipeline must build each component once."""
    recipe = {
        "source": {"type": "kafka", "config": {"bootstrap": "localhost:9092"}},
        "sink": {"type": "console"},
    }

    pipeline = Pipeline.create(recipe)
    pipeline.run()
    pipeline.raise_from_status()

    # Both patched components must have been invoked exactly one time.
    mock_source.assert_called_once()
    mock_sink.assert_called_once()
def test_tableau_ingest(pytestconfig, tmp_path):
    """Run the tableau source against a mocked server client and golden-check the output."""
    # Published as a module global; presumably read by
    # side_effect_query_metadata - verify against that helper.
    global test_resources_dir
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/tableau"
    )

    with mock.patch("tableauserverclient.Server") as mock_sdk:
        # Metadata queries are answered by the side-effect helper.
        mocked_metadata = mock.Mock()
        mocked_metadata.query.side_effect = side_effect_query_metadata

        mock_client = mock.Mock()
        mock_client.metadata = mocked_metadata
        mock_client.auth = mock.Mock()
        mock_client.auth.sign_in.return_value = None
        mock_client.auth.sign_out.return_value = None

        mock_sdk.return_value = mock_client
        mock_sdk._auth_token = "ABC"

        tableau_source = {
            "type": "tableau",
            "config": {
                "username": "******",
                "password": "******",
                "connect_uri": "https://do-not-connect",
                "site": "acryl",
                "projects": ["default", "Project 2"],
                "ingest_tags": True,
                "ingest_owner": True,
                "default_schema_map": {
                    "dvdrental": "public",
                    "someotherdb": "schema",
                },
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/tableau_mces.json",
            },
        }
        pipeline = Pipeline.create(
            {"run_id": "tableau-test", "source": tableau_source, "sink": file_sink}
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=f"{tmp_path}/tableau_mces.json",
            golden_path=test_resources_dir / "tableau_mces_golden.json",
            ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
        )
def test_trino_hive_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    """Ingest the Trino hive catalog and golden-check the output.

    Three ``customProperties`` entries are excluded from the comparison.
    """
    mce_out_file = "trino_hive_mces.json"
    events_file = tmp_path / mce_out_file

    trino_source = {
        "type": data_platform,
        "config": TrinoConfig(
            host_port="localhost:5300",
            database="hivedb",
            username="******",
            schema_pattern=AllowDenyPattern(allow=["^db1"]),
        ).dict(),
    }
    file_sink = {
        "type": "file",
        "config": FileSinkConfig(filename=str(events_file)).dict(),
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(
        {"run_id": "trino-hive-test", "source": trino_source, "sink": file_sink}
    )
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Limitation 1 - MCE contains "nullable": true for all fields in trino
    # database, irrespective of not null constraints present in underlying
    # postgres database. This is issue with trino, also reported here -
    # https://github.com/trinodb/trino/issues/6400,
    # Related : https://github.com/trinodb/trino/issues/4070
    # Limitation 2 - Dataset properties for postgres view (view definition,
    # etc) are not part of MCE from trino. Postgres views are exposed as tables
    # in trino. This setting depends on trino connector implementation -
    # https://trino.io/episodes/18.html

    # All ignored paths share this prefix and differ only in the final key.
    custom_properties_path = r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]"
    ignored_keys = ("transient_lastddltime", "numfiles", "totalsize")
    ignore_paths = [custom_properties_path + rf"\['{key}'\]" for key in ignored_keys]

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=events_file,
        golden_path=test_resources_dir / "trino_hive_mces_golden.json",
        ignore_paths=ignore_paths,
    )
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
    suppress_error_logs: bool,
) -> None:
    """Ingest metadata into DataHub."""
    # Flow: load the recipe, build the pipeline, run it (logging both reports
    # if the run raises), then exit with the summary printer's status.
    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    pipeline_config = load_config_file(pathlib.Path(config))

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config, dry_run, preview, preview_workunits)
    except ValidationError as validation_error:
        # Config validation problems are shown to the user as-is.
        click.echo(validation_error, err=True)
        sys.exit(1)
    except Exception as unexpected_error:
        # The pipeline_config may contain sensitive information, so we wrap the
        # exception in a SensitiveError to prevent detailed variable-level
        # information from being logged.
        raise SensitiveError() from unexpected_error

    logger.info("Starting metadata ingestion")
    try:
        pipeline.run()
    except Exception as e:
        logger.info(
            f"Source ({pipeline.config.source.type}) report:\n{pipeline.source.get_report().as_string()}"
        )
        logger.info(
            f"Sink ({pipeline.config.sink.type}) report:\n{pipeline.sink.get_report().as_string()}"
        )
        # We dont want to log sensitive information in variables if the
        # pipeline fails due to an unexpected error. Disable printing sensitive
        # info to logs if ingestion is running with `--suppress-error-logs` flag.
        if not suppress_error_logs:
            raise e
        raise SensitiveError() from e

    # Only reached when the run raised nothing; the handler above always raises.
    logger.info("Finished metadata pipeline")
    pipeline.log_ingestion_stats()
    exit_code = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    sys.exit(exit_code)
def test_okta_source_include_deprovisioned_suspended_users(pytestconfig, tmp_path):
    """Run the okta source with deprovisioned and suspended users included, against a mocked client."""
    test_resources_dir: pathlib.Path = pytestconfig.rootpath / "tests/integration/okta"
    output_name = "okta_mces_include_deprovisioned_suspended_users.json"

    with patch("datahub.ingestion.source.identity.okta.OktaClient") as MockClient:
        _init_mock_okta_client(test_resources_dir, MockClient)

        # Run an Okta usage ingestion run.
        # Note that the option values are passed as strings, not native types.
        okta_source = {
            "type": "okta",
            "config": {
                "okta_domain": "mock-domain.okta.com",
                "okta_api_token": "mock-okta-token",
                "ingest_users": "True",
                "ingest_groups": "True",
                "ingest_group_membership": "True",
                "okta_profile_to_username_attr": "login",
                "okta_profile_to_username_regex": "([^@]+)",
                "okta_profile_to_group_name_attr": "name",
                "okta_profile_to_group_name_regex": "(.*)",
                "include_deprovisioned_users": "True",
                "include_suspended_users": "True",
                "page_size": "2",
                "delay_seconds": "0.00",
            },
        }
        file_sink = {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{output_name}",
            },
        }
        pipeline = Pipeline.create(
            {"run_id": "test-okta-usage", "source": okta_source, "sink": file_sink}
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / output_name,
            golden_path=test_resources_dir
            / "okta_mces_golden_include_deprovisioned_suspended_users.json",
        )