Example #1
def test_mode_ingest_success(pytestconfig, tmp_path):
    with patch(
            "datahub.ingestion.source.mode.requests.session",
            side_effect=mocked_requests_sucess,
    ):
        global test_resources_dir
        test_resources_dir = pytestconfig.rootpath / "tests/integration/mode"

        pipeline = Pipeline.create({
            "run_id": "mode-test",
            "source": {
                "type": "mode",
                "config": {
                    "token": "xxxx",
                    "password": "******",
                    "connect_uri": "https://app.mode.com/",
                    "workspace": "acryl",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/mode_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=f"{tmp_path}/mode_mces.json",
            golden_path=test_resources_dir / "mode_mces_golden.json",
            ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
        )
Example #2
def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path,
                        mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "mongo") as docker_services:
        wait_for_port(docker_services, "testmongodb", 27017)

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create({
            "run_id": "mongodb-test",
            "source": {
                "type": "mongodb",
                "config": {
                    "connect_uri": "mongodb://localhost:57017",
                    "username": "******",
                    "password": "******",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/mongodb_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mongodb_mces.json",
            golden_path=test_resources_dir / "mongodb_mces_golden.json",
        )
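Several of these tests rely on a wait_for_port helper that is not shown here. A minimal sketch of such a helper, assuming the pytest-docker docker_services fixture seen above, could look like the following; the socket-based readiness check is an illustrative choice, not necessarily the real implementation:

import contextlib
import socket


def wait_for_port(docker_services, container_name: str, container_port: int, timeout: float = 30.0) -> None:
    # Resolve the host port that docker-compose mapped for this service/port.
    host_port = docker_services.port_for(container_name, container_port)

    def can_connect() -> bool:
        # Treat the service as ready once a TCP connection to the mapped port succeeds.
        with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
            sock.settimeout(1)
            return sock.connect_ex(("localhost", host_port)) == 0

    docker_services.wait_until_responsive(timeout=timeout, pause=0.5, check=can_connect)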
Example #3
def test_hive_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/hive"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "hive") as docker_services:
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        # Set up the container.
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "hive_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "hive_mces.json",
            golden_path=test_resources_dir / "hive_mces_golden.json",
            ignore_paths=[
                # example: root[1]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['CreateTime:']
                # example: root[2]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['Table Parameters: transient_lastDdlTime']
                r"root\[\d+\]\['proposedSnapshot'\]\['com\.linkedin\.pegasus2avro\.metadata\.snapshot\.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com\.linkedin\.pegasus2avro\.dataset\.DatasetProperties'\]\['customProperties'\]\['.*Time.*'\]",
                r"root\[6\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]\[\d+\]\['nativeDataType'\]",
            ],
        )
Example #4
def test_feast_repository_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/feast"
    output_path = tmp_path / "feast_repository_mces.json"

    pipeline = Pipeline.create({
        "run_id": "feast-repository-test",
        "source": {
            "type": "feast",
            "config": {
                "path": str(test_resources_dir / "feature_store"),
                "environment": "PROD",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": str(output_path),
            },
        },
    })

    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=output_path,
        golden_path=test_resources_dir / "feast_repository_mces_golden.json",
    )
Example #5
def test_mssql_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/sql_server"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "sql-server") as docker_services:
        # Wait for SQL Server to be ready. We wait a few extra seconds, since the port being open
        # does not mean the server is accepting connections yet.
        # TODO: find a better way to check for liveness (see the sketch after this example).
        wait_for_port(docker_services, "testsqlserver", 1433)
        time.sleep(5)

        # Run the setup.sql file to populate the database.
        command = "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P 'test!Password' -d master -i /setup/setup.sql"
        ret = subprocess.run(command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        assert ret.returncode == 0

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "mssql_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"],
                        tmp_path=tmp_path,
                        check_result=True)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "mssql_mces.json",
            golden_path=test_resources_dir / "mssql_mces_golden.json",
        )
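The TODO above asks for a better liveness check than the fixed sleep. One possibility (a sketch, not the project's actual solution) is to poll the same sqlcmd binary the setup step already uses until a trivial query succeeds, and hand that check to pytest-docker:

import subprocess


def mssql_accepts_connections() -> bool:
    # Run a trivial query inside the container; success implies the server accepts logins.
    ret = subprocess.run(
        "docker exec testsqlserver /opt/mssql-tools/bin/sqlcmd"
        " -S localhost -U sa -P 'test!Password' -Q 'SELECT 1'",
        shell=True,
        capture_output=True,
    )
    return ret.returncode == 0


# Usage inside the test, replacing the time.sleep(5):
# docker_services.wait_until_responsive(timeout=60, pause=1, check=mssql_accepts_connections)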
Example #6
def test_bq_usage_source(pytestconfig, tmp_path):
    # from google.cloud.logging_v2 import ProtobufEntry

    test_resources_dir: pathlib.Path = (pytestconfig.rootpath /
                                        "tests/integration/bigquery-usage")
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"

    if WRITE_REFERENCE_FILE:
        source = BigQueryUsageSource.create(
            dict(
                projects=[
                    "harshal-playground-306419",
                ],
                start_time=datetime.now(tz=timezone.utc) - timedelta(days=25),
            ),
            PipelineContext(run_id="bq-usage-test"),
        )
        entries = list(
            source._get_bigquery_log_entries_via_gcp_logging(
                source._make_bigquery_logging_clients()))

        entries = [entry._replace(logger=None) for entry in entries]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    with unittest.mock.patch(
            "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient",
            autospec=True) as MockClient:
        # Add mock BigQuery API responses.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Run a BigQuery usage ingestion run.
        pipeline = Pipeline.create({
            "run_id": "test-bigquery-usage",
            "source": {
                "type": "bigquery-usage",
                "config": {
                    "projects": ["sample-bigquery-project-1234"],
                    "start_time": "2021-01-01T00:00Z",
                    "end_time": "2021-07-01T00:00Z",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/bigquery_usages.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "bigquery_usages.json",
        golden_path=test_resources_dir / "bigquery_usages_golden.json",
    )
Example #7
def test_data_lake_local_ingest(pytestconfig, source_file, tmp_path,
                                mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {}
    source["config"]["path_spec"]["include"] = source["config"]["path_spec"][
        "include"].replace("s3://my-test-bucket/",
                           "tests/integration/s3/test_data/local_system/")
    source["config"]["profiling"]["enabled"] = True
    source["config"].pop("aws_config")
    config_dict["source"] = source
    config_dict["sink"] = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/{source_file}",
        },
    }

    config_dict["run_id"] = source_file

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=
        f"{test_resources_dir}/golden-files/local/golden_mces_{source_file}",
    )
Example #8
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:

        wait_for_port(docker_services, "test_broker", 59092, timeout=120)
        wait_for_port(docker_services, "test_schema_registry", 8081, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
        run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
Example #9
def test_serde_to_json(pytestconfig: PytestConfig, tmp_path: pathlib.Path,
                       json_filename: str) -> None:
    golden_file = pytestconfig.rootpath / json_filename

    output_filename = "output.json"
    output_file = tmp_path / output_filename

    pipeline = Pipeline.create({
        "source": {
            "type": "file",
            "config": {
                "filename": str(golden_file)
            }
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": str(output_file)
            }
        },
        "run_id": "serde_test",
    })
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{output_filename}",
        golden_path=golden_file,
    )
Example #10
def test_data_lake_s3_ingest(pytestconfig, s3_populate, source_file, tmp_path,
                             mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {}
    config_dict["source"] = source
    config_dict["sink"] = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/{source_file}",
        },
    }

    config_dict["run_id"] = source_file

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=
        f"{test_resources_dir}/golden-files/s3/golden_mces_{source_file}",
    )
Example #11
def test_trino_ingest(docker_compose_runner, pytestconfig, tmp_path,
                      mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/trino"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "trino") as docker_services:
        wait_for_port(docker_services, "testtrino", 8080)
        wait_for_port(docker_services, "testhiveserver2", 10000, timeout=120)

        docker_services.wait_until_responsive(
            timeout=30,
            pause=1,
            check=lambda: requests.get("http://localhost:5300/v1/info").json()[
                "starting"] is False,
        )

        # Set up the hive db
        command = "docker exec testhiveserver2 /opt/hive/bin/beeline -u jdbc:hive2://localhost:10000 -f /hive_setup.sql"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            print(tmp_path)

            # Run the metadata ingestion pipeline for trino catalog referring to postgres database
            config_file = (test_resources_dir / "trino_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_mces.json",
                golden_path=test_resources_dir / "trino_mces_golden.json",
            )

            # Limitation 1 - The MCE contains "nullable": true for all fields in the Trino database, irrespective of NOT NULL constraints in the underlying Postgres database.
            # This is a Trino issue, reported here - https://github.com/trinodb/trino/issues/6400 (related: https://github.com/trinodb/trino/issues/4070).

            # Limitation 2 - Dataset properties for Postgres views (view definition, etc.) are not part of the MCE emitted via Trino.
            # Postgres views are exposed as tables in Trino; this depends on the Trino connector implementation - https://trino.io/episodes/18.html

            # Run the metadata ingestion pipeline for trino catalog referring to hive database
            config_file = (test_resources_dir /
                           "trino_hive_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert_result_ok(result)

            # Verify the output.
            mce_helpers.check_golden_file(
                pytestconfig,
                output_path="trino_hive_mces.json",
                golden_path=test_resources_dir / "trino_hive_mces_golden.json",
                ignore_paths=[
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
                    r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
                ],
            )
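assert_result_ok is defined elsewhere; given that the sibling Kafka test simply asserts result.exit_code == 0, it is presumably a small wrapper along these lines (sketch, not the project's actual helper):

from click.testing import Result


def assert_result_ok(result: Result) -> None:
    # Re-raise any captured exception first so the real CLI failure is visible in the test output.
    if result.exception:
        raise result.exception
    assert result.exit_code == 0, f"CLI exited with code {result.exit_code}: {result.output}"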
Example #12
def test_lookml_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"

    pipeline = Pipeline.create({
        "run_id": "lookml-test",
        "source": {
            "type": "lookml",
            "config": {
                "base_folder": str(test_resources_dir / "lkml_samples"),
                "connection_to_platform_map": {
                    "my_connection": "conn"
                },
                "parse_table_names_from_sql": True,
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/lookml_mces.json",
            },
        },
    })
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "lookml_mces.json",
        golden_path=test_resources_dir / "expected_output.json",
    )
Example #13
def test_kafka_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/kafka"

    with docker_compose_runner(
        test_resources_dir / "docker-compose.yml", "kafka"
    ) as docker_services:

        wait_for_port(docker_services, "test_broker", 9092, timeout=120)

        # Set up topics and produce some data
        command = f"{test_resources_dir}/send_records.sh {test_resources_dir}"
        subprocess.run(command, shell=True, check=True)

        # Run the metadata ingestion pipeline.
        runner = CliRunner()
        with fs_helpers.isolated_filesystem(tmp_path):
            config_file = (test_resources_dir / "kafka_to_file.yml").resolve()
            result = runner.invoke(datahub, ["ingest", "-c", f"{config_file}"])
            assert result.exit_code == 0

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "kafka_mces.json",
            golden_path=test_resources_dir / "kafka_mces_golden.json",
            ignore_paths=[],
        )
Example #14
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    # The test manifest, catalog, and sources files are generated from https://github.com/kevinhu/sample-dbt
    pipeline = Pipeline.create({
        "run_id": "dbt-test",
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
                "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
                "sources_path": f"{test_resources_dir}/dbt_sources.json",
                "target_platform": "dbt",
                "load_schemas": True,
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/dbt_mces.json",
            },
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "dbt_mces.json",
        golden_path=test_resources_dir / "dbt_mces_golden.json",
    )
Example #15
def test_ge_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time,
                   **kwargs):

    test_resources_dir = pytestconfig.rootpath / "tests/integration/great-expectations"

    with docker_compose_runner(
            test_resources_dir / "docker-compose.yml",
            "great-expectations") as docker_services, mock.patch(
                "datahub.emitter.rest_emitter.DatahubRestEmitter.emit_mcp"
            ) as mock_emit_mcp:
        wait_for_port(docker_services, "ge_postgres", 5432)

        emitter = MockDatahubEmitter("")
        mock_emit_mcp.side_effect = emitter.emit_mcp

        shutil.copytree(
            test_resources_dir / "setup/great_expectations",
            tmp_path / "great_expectations",
        )
        context = ge.DataContext.create(tmp_path)
        context.run_checkpoint(checkpoint_name="test_checkpoint")

        emitter.write_to_file(tmp_path / "ge_mcps.json")

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "ge_mcps.json",
            golden_path=test_resources_dir / "ge_mcps_golden.json",
            ignore_paths=[],
        )
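MockDatahubEmitter comes from the test's helpers and is not shown. A minimal stand-in that would satisfy the usage above (record each MCP, then dump them to a file) might look like the following sketch; the real helper may serialize differently:

import json
import pathlib
from typing import List

from datahub.emitter.mcp import MetadataChangeProposalWrapper


class MockDatahubEmitter:
    def __init__(self, gms_server: str):
        self.mcps: List[MetadataChangeProposalWrapper] = []

    def emit_mcp(self, mcp: MetadataChangeProposalWrapper) -> None:
        # Capture the MCP instead of sending it to a DataHub instance.
        self.mcps.append(mcp)

    def write_to_file(self, filename: pathlib.Path) -> None:
        with open(filename, "w") as f:
            json.dump([mcp.to_obj() for mcp in self.mcps], f, indent=4)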
Example #16
def test_azure_ad_source_nested_groups(pytestconfig, tmp_path):

    test_resources_dir: pathlib.Path = (pytestconfig.rootpath /
                                        "tests/integration/azure_ad")

    with patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource.get_token"
    ) as mock_token, patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_users"
    ) as mock_users, patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_groups"
    ) as mock_groups, patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_group_members"
    ) as mock_group_users:
        mocked_functions(
            test_resources_dir,
            mock_token,
            mock_users,
            mock_groups,
            mock_group_users,
            True,
        )
        # Run an azure usage ingestion run.
        pipeline = Pipeline.create({
            "run_id": "test-azure-ad",
            "source": {
                "type": "azure-ad",
                "config": {
                    "client_id": "00000000-0000-0000-0000-000000000000",
                    "tenant_id": "00000000-0000-0000-0000-000000000000",
                    "client_secret": "client_secret",
                    "redirect":
                    "https://login.microsoftonline.com/common/oauth2/nativeclient",
                    "authority":
                    "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000",
                    "token_url":
                    "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000/oauth2/token",
                    "graph_url": "https://graph.microsoft.com/v1.0",
                    "ingest_group_membership": True,
                    "ingest_groups": True,
                    "ingest_users": False,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/azure_ad_mces_nested_groups.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "azure_ad_mces_nested_groups.json",
        golden_path=test_resources_dir /
        "azure_ad_mces_golden_nested_groups.json",
    )
Example #17
def test_looker_ingest(pytestconfig, tmp_path, mock_time):
    mocked_client = mock.MagicMock()
    with mock.patch(
            "datahub.ingestion.source.looker.LookerDashboardSource._get_looker_client",
            mocked_client,
    ):
        mocked_client.return_value.all_dashboards.return_value = [
            Dashboard(id="1")
        ]
        mocked_client.return_value.dashboard.return_value = Dashboard(
            id="1",
            title="foo",
            created_at=datetime.utcfromtimestamp(time.time()),
            description="lorem ipsum",
            dashboard_elements=[
                DashboardElement(
                    id="2",
                    type="",
                    subtitle_text="Some text",
                    query=Query(
                        model="data",
                        view="my_view",
                        dynamic_fields=
                        '[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
                    ),
                )
            ],
        )

        test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"

        pipeline = Pipeline.create({
            "run_id": "looker-test",
            "source": {
                "type": "looker",
                "config": {
                    "base_url": "https://looker.company.com",
                    "client_id": "foo",
                    "client_secret": "bar",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/looker_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "looker_mces.json",
            golden_path=test_resources_dir / "expected_output.json",
        )
Example #18
def test_trino_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path,
                      mock_time):

    # Run the metadata ingestion pipeline.
    with fs_helpers.isolated_filesystem(tmp_path):

        # Run the metadata ingestion pipeline for trino catalog referring to postgres database
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type":
                data_platform,
                "config":
                TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)
        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
Example #19
def test_tableau_ingest(pytestconfig, tmp_path):

    global test_resources_dir
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/tableau"
    )

    with mock.patch("tableauserverclient.Server") as mock_sdk:
        mock_client = mock.Mock()
        mocked_metadata = mock.Mock()
        mocked_metadata.query.side_effect = side_effect_query_metadata
        mock_client.metadata = mocked_metadata
        mock_client.auth = mock.Mock()
        mock_client.auth.sign_in.return_value = None
        mock_client.auth.sign_out.return_value = None
        mock_sdk.return_value = mock_client
        mock_sdk._auth_token = "ABC"

        pipeline = Pipeline.create(
            {
                "run_id": "tableau-test",
                "source": {
                    "type": "tableau",
                    "config": {
                        "username": "******",
                        "password": "******",
                        "connect_uri": "https://do-not-connect",
                        "site": "acryl",
                        "projects": ["default", "Project 2"],
                        "ingest_tags": True,
                        "ingest_owner": True,
                        "default_schema_map": {
                            "dvdrental": "public",
                            "someotherdb": "schema",
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/tableau_mces.json",
                    },
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=f"{tmp_path}/tableau_mces.json",
            golden_path=test_resources_dir / "tableau_mces_golden.json",
            ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
        )
Example #20
def test_trino_hive_ingest(loaded_trino, test_resources_dir, pytestconfig,
                           tmp_path, mock_time):

    # Run the metadata ingestion pipeline for trino catalog referring to postgres database
    mce_out_file = "trino_hive_mces.json"
    events_file = tmp_path / mce_out_file

    pipeline_config = {
        "run_id": "trino-hive-test",
        "source": {
            "type":
            data_platform,
            "config":
            TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Limitation 1 - The MCE contains "nullable": true for all fields in the Trino database, irrespective of NOT NULL constraints in the underlying Postgres database.
    # This is a Trino issue, reported here - https://github.com/trinodb/trino/issues/6400 (related: https://github.com/trinodb/trino/issues/4070).
    # Limitation 2 - Dataset properties for Postgres views (view definition, etc.) are not part of the MCE emitted via Trino.
    # Postgres views are exposed as tables in Trino; this depends on the Trino connector implementation - https://trino.io/episodes/18.html

    # Run the metadata ingestion pipeline for trino catalog referring to hive database
    # config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
    # run_datahub_cmd(["ingest", "-c", f"{config_file}"])

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=events_file,
        golden_path=test_resources_dir / "trino_hive_mces_golden.json",
        ignore_paths=[
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
        ],
    )
Example #21
def test_openapi_ingest(pytestconfig, tmp_path):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/openapi"

    # Run the metadata ingestion pipeline.
    config_file = (test_resources_dir / "openapi_to_file.yml").resolve()
    run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path="/tmp/openapi_mces.json",
        golden_path=test_resources_dir / "openapi_mces_golden.json",
    )
Example #22
def test_okta_source_include_deprovisioned_suspended_users(
        pytestconfig, tmp_path):

    test_resources_dir: pathlib.Path = pytestconfig.rootpath / "tests/integration/okta"

    with patch(
            "datahub.ingestion.source.identity.okta.OktaClient") as MockClient:

        _init_mock_okta_client(test_resources_dir, MockClient)

        # Run an Okta usage ingestion run.
        pipeline = Pipeline.create({
            "run_id": "test-okta-usage",
            "source": {
                "type": "okta",
                "config": {
                    "okta_domain": "mock-domain.okta.com",
                    "okta_api_token": "mock-okta-token",
                    "ingest_users": "True",
                    "ingest_groups": "True",
                    "ingest_group_membership": "True",
                    "okta_profile_to_username_attr": "login",
                    "okta_profile_to_username_regex": "([^@]+)",
                    "okta_profile_to_group_name_attr": "name",
                    "okta_profile_to_group_name_regex": "(.*)",
                    "include_deprovisioned_users": "True",
                    "include_suspended_users": "True",
                    "page_size": "2",
                    "delay_seconds": "0.00",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename":
                    f"{tmp_path}/okta_mces_include_deprovisioned_suspended_users.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path /
        "okta_mces_include_deprovisioned_suspended_users.json",
        golden_path=test_resources_dir /
        "okta_mces_golden_include_deprovisioned_suspended_users.json",
    )
Example #23
def test_redshift_usage_filtering(pytestconfig, tmp_path):

    test_resources_dir = pathlib.Path(pytestconfig.rootpath /
                                      "tests/integration/redshift-usage")

    with patch("datahub.ingestion.source.usage.redshift_usage.Engine.execute"
               ) as mock_engine_execute:
        access_events = load_access_events(test_resources_dir)
        mock_engine_execute.return_value = access_events

        # Run ingestion
        pipeline = Pipeline.create(
            {
                "run_id": "test-redshift-usage",
                "source": {
                    "type": "redshift-usage",
                    "config": {
                        "host_port": "xxxxx",
                        "database": "xxxxx",
                        "username": "******",
                        "password": "******",
                        "email_domain": "acryl.io",
                        "include_views": True,
                        "include_tables": True,
                        "schema_pattern": {
                            "allow": ["public"]
                        },
                        "table_pattern": {
                            "deny": ["orders"]
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/redshift_usages.json"
                    },
                },
            }, )
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "redshift_usages.json",
        golden_path=test_resources_dir /
        "redshift_usages_filtered_golden.json",
    )
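load_access_events is a local helper that loads the canned audit/access events handed to the mocked engine above. A sketch under that assumption (the fixture filename here is a hypothetical placeholder, not the real name):

import json
import pathlib
from typing import Dict, List


def load_access_events(test_resources_dir: pathlib.Path) -> List[Dict]:
    # "access_events.json" is a hypothetical fixture name used for illustration.
    with (test_resources_dir / "access_events.json").open() as f:
        return json.load(f)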
Example #24
def test_sagemaker_ingest(tmp_path, pytestconfig):

    sagemaker_source_instance = sagemaker_source()

    with Stubber(sagemaker_source_instance.sagemaker_client) as sagemaker_stubber:

        sagemaker_stubber.add_response(
            "list_feature_groups",
            list_feature_groups_response,
            {},
        )
        sagemaker_stubber.add_response(
            "describe_feature_group",
            describe_feature_group_response_1,
            {
                "FeatureGroupName": "test-2",
            },
        )
        sagemaker_stubber.add_response(
            "describe_feature_group",
            describe_feature_group_response_2,
            {
                "FeatureGroupName": "test-1",
            },
        )
        sagemaker_stubber.add_response(
            "describe_feature_group",
            describe_feature_group_response_3,
            {
                "FeatureGroupName": "test",
            },
        )

        mce_objects = [
            wu.mce.to_obj() for wu in sagemaker_source_instance.get_workunits()
        ]

        with open(str(tmp_path / "sagemaker_mces.json"), "w") as f:
            json.dump(mce_objects, f, indent=2)

    # Verify the output.
    test_resources_dir = pytestconfig.rootpath / "tests/unit/sagemaker"
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "sagemaker_mces.json",
        golden_path=test_resources_dir / "sagemaker_mces_golden.json",
    )
Example #25
def test_redshift_usage_source(pytestconfig, tmp_path):

    test_resources_dir = pathlib.Path(pytestconfig.rootpath /
                                      "tests/integration/redshift-usage")

    with patch("datahub.ingestion.source.usage.redshift_usage.Engine.execute"
               ) as mock_engine_execute:
        raw_access_events: List[Dict] = load_access_events(test_resources_dir)
        mock_engine_execute.return_value = raw_access_events

        # Run ingestion
        pipeline = Pipeline.create(
            {
                "run_id": "test-redshift-usage",
                "source": {
                    "type": "redshift-usage",
                    "config": {
                        "host_port": "xxxxx",
                        "database": "xxxxx",
                        "username": "******",
                        "password": "******",
                        "email_domain": "acryl.io",
                        "include_views": True,
                        "include_tables": True,
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/redshift_usages.json"
                    },
                },
            }, )
        pipeline.run()
        pipeline.raise_from_status()

    # There should be 2 calls (one for usage aspects, one for operation aspects).
    assert mock_engine_execute.call_count == 2
    source_report: RedshiftUsageSourceReport = cast(
        RedshiftUsageSourceReport, pipeline.source.get_report())
    assert source_report.num_usage_workunits_emitted == 3
    assert source_report.num_operational_stats_workunits_emitted == 3
    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "redshift_usages.json",
        golden_path=test_resources_dir / "redshift_usages_golden.json",
    )
Example #26
def ingestion_test(
    pytestconfig: Any,
    tmp_path: pathlib.Path,
    mock_time: int,
    mock_connection: DBConnection,
) -> None:  # noqa : No need for type annotations here
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
    mce_out_file = f"lookml_mces_api_{mock_connection.dialect_name}.json"
    mocked_client = mock.MagicMock()
    mock_model = mock.MagicMock(project_name="lkml_samples")
    with mock.patch("looker_sdk.init31") as mock_sdk:
        mock_sdk.return_value = mocked_client
        # mock_connection = mock.MagicMock()
        mocked_client.connection.return_value = mock_connection
        mocked_client.lookml_model.return_value = mock_model

        pipeline = Pipeline.create({
            "run_id": "lookml-test",
            "source": {
                "type": "lookml",
                "config": {
                    "base_folder": str(test_resources_dir / "lkml_samples"),
                    "api": {
                        "client_id": "fake_client_id",
                        "client_secret": "fake_secret",
                        "base_url": "fake_account.looker.com",
                    },
                    "parse_table_names_from_sql": True,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/{mce_out_file}",
                },
            },
        })
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / mce_out_file,
            golden_path=test_resources_dir / mce_out_file,
        )
Example #27
def test_data_lake_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/data_lake/"

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create({
        "run_id": "data-lake-test",
        "source": {
            "type": "data-lake",
            "config": {
                "base_path": str(test_resources_dir / "test_data"),
                "use_relative_path":
                True,  # should be enabled for testing since full paths will differ on different machines
                "path_spec": "./{name[0]}/{name[1]}.{format}",
                "platform": "data-lake-test",
                "profiling": {
                    "enabled": True,
                    "profile_table_level_only": False,
                    "include_field_min_value": True,
                    "include_field_max_value": True,
                    "include_field_mean_value": True,
                    "include_field_median_value": True,
                    "include_field_stddev_value": True,
                    "include_field_quantiles": True,
                    "include_field_distinct_value_frequencies": True,
                    "include_field_histogram": True,
                    "include_field_sample_values": True,
                },
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/data_lake_mces.json",
            },
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "data_lake_mces.json",
        golden_path=test_resources_dir / "data_lake_mces_golden.json",
    )
Example #28
def test_trino_usage_source(pytestconfig, tmp_path):

    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/starburst-trino-usage")

    with patch(
            "datahub.ingestion.source.usage.starburst_trino_usage.TrinoUsageSource._get_trino_history"
    ) as mock_event_history:
        access_events = load_access_events(test_resources_dir)
        mock_event_history.return_value = access_events

        # Run ingestion
        pipeline = Pipeline.create(
            {
                "run_id": "test-trino-usage",
                "source": {
                    "type": "starburst-trino-usage",
                    "config": {
                        "host_port": "xxxxx",
                        "database": "testcatalog",
                        "username": "******",
                        "password": "******",
                        "audit_catalog": "test",
                        "audit_schema": "test",
                        "email_domain": "acryl.io",
                        "include_views": True,
                        "include_tables": True,
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/trino_usages.json"
                    },
                },
            }, )
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig=pytestconfig,
        output_path=tmp_path / "trino_usages.json",
        golden_path=test_resources_dir / "trino_usages_golden.json",
    )
Example #29
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    config_variants = [
        DbtTestConfig(
            "dbt-test-with-schemas",
            test_resources_dir,
            tmp_path,
            "dbt_with_schemas_mces.json",
            "dbt_with_schemas_mces_golden.json",
            source_config_modifiers={"load_schemas": True},
        ),
        DbtTestConfig(
            "dbt-test-without-schemas",
            test_resources_dir,
            tmp_path,
            "dbt_without_schemas_mces.json",
            "dbt_without_schemas_mces_golden.json",
            source_config_modifiers={"load_schemas": False},
        ),
    ]

    for config in config_variants:

        # The test manifest, catalog, and sources files are generated from https://github.com/kevinhu/sample-dbt
        pipeline = Pipeline.create({
            "run_id": config.run_id,
            "source": {
                "type": "dbt",
                "config": config.source_config
            },
            "sink": {
                "type": "file",
                "config": config.sink_config,
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=config.output_path,
            golden_path=config.golden_path,
        )
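DbtTestConfig is a test helper defined elsewhere; from the call sites above it evidently bundles a run id, dbt source/sink configs, and output/golden paths. A rough sketch under that assumption, reusing the same manifest/catalog/sources paths as the single-variant dbt test earlier (the real helper may differ):

import pathlib
from typing import Any, Dict, Optional


class DbtTestConfig:
    def __init__(
        self,
        run_id: str,
        test_resources_dir: pathlib.Path,
        tmp_path: pathlib.Path,
        output_file: str,
        golden_file: str,
        source_config_modifiers: Optional[Dict[str, Any]] = None,
    ):
        self.run_id = run_id
        self.output_path = str(tmp_path / output_file)
        self.golden_path = str(test_resources_dir / golden_file)
        # Base dbt source config, overridable per test variant.
        self.source_config = {
            "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
            "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
            "sources_path": f"{test_resources_dir}/dbt_sources.json",
            "target_platform": "dbt",
            **(source_config_modifiers or {}),
        }
        self.sink_config = {"filename": self.output_path}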
Example #30
def test_lookml_bad_sql_parser(pytestconfig, tmp_path, mock_time):
    """Incorrect specification of sql parser should not fail ingestion"""
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"
    mce_out = "lookml_mces_badsql_parser.json"
    pipeline = Pipeline.create({
        "run_id": "lookml-test",
        "source": {
            "type": "lookml",
            "config": {
                "base_folder": str(test_resources_dir / "lkml_samples"),
                "connection_to_platform_map": {
                    "my_connection": {
                        "platform": "snowflake",
                        "default_db": "default_db",
                        "default_schema": "default_schema",
                    }
                },
                "parse_table_names_from_sql": True,
                "project_name": "lkml_samples",
                "sql_parser": "bad.sql.Parser",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/{mce_out}",
            },
        },
    })
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=False)
    try:
        pipeline.raise_from_status(raise_warnings=True)
        assert False, "Pipeline should have generated warnings"
    except PipelineExecutionError:
        pass

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / mce_out,
        golden_path=test_resources_dir / mce_out,
    )
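The try/except/assert-False block near the end of this example can also be expressed with pytest.raises, which states the same expectation more directly (equivalent sketch; assumes the same pipeline object and PipelineExecutionError import as in the test above):

import pytest

with pytest.raises(PipelineExecutionError):
    # The bad sql_parser setting should surface as a warning, which raise_warnings escalates to an error.
    pipeline.raise_from_status(raise_warnings=True)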