def test_trino_instance_ingest(loaded_trino, test_resources_dir, pytestconfig,
                               tmp_path, mock_time):
    instance = "production_warehouse"
    platform = "trino"
    mce_out_file = "trino_instance_mces.json"
    events_file = tmp_path / mce_out_file
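    # Recipe under test: ingest the Trino "hivedb" catalog with a platform_instance set,
    # so every emitted dataset URN should be scoped to that instance.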
    pipeline_config = {
        "run_id": "trino-hive-instance-test",
        "source": {
            "type":
            data_platform,
            "config":
            TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                platform_instance="production_warehouse",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Assert that all generated events have instance-specific URNs.
    urn_pattern = "^" + re.escape(
        f"urn:li:dataset:(urn:li:dataPlatform:{platform},{instance}.")
    assert (mce_helpers.assert_mce_entity_urn(
        "ALL",
        entity_type="dataset",
        regex_pattern=urn_pattern,
        file=events_file,
    ) >= 0), "There should be at least one match"

    assert (mce_helpers.assert_mcp_entity_urn(
        "ALL",
        entity_type="dataset",
        regex_pattern=urn_pattern,
        file=events_file,
    ) >= 0), "There should be at least one MCP"

    # Every dataset entity emitted must have a dataPlatformInstance aspect,
    # and at least one such entity must be emitted.
    assert (mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="dataPlatformInstance",
        aspect_field_matcher={
            "instance":
            f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:{platform},{instance})"
        },
        file=events_file,
    ) >= 1)
Example #2
    def test_pipeline_process_commits(self, commit_policy, source, should_commit):
        pipeline = Pipeline.create(
            {
                "source": {"type": f"tests.unit.test_pipeline.{source}"},
                "sink": {"type": "console"},
                "run_id": "pipeline_test",
            }
        )
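        # Minimal Committable stub; its commit() is wrapped with a mock below so we
        # can check whether the pipeline commits it under the given commit_policy.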

        class FakeCommittable(Committable):
            def __init__(self, commit_policy: CommitPolicy):
                self.name = "test_checkpointer"
                self.commit_policy = commit_policy

            def commit(self) -> None:
                pass

        fake_committable: Committable = FakeCommittable(commit_policy)

        with patch.object(
            FakeCommittable, "commit", wraps=fake_committable.commit
        ) as mock_commit:
            pipeline.ctx.register_reporter(fake_committable)

            pipeline.run()
            # commit() should be called exactly once when should_commit is True,
            # and never when it is False.
            if should_commit:
                mock_commit.assert_called_once()
            else:
                mock_commit.assert_not_called()
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    # The test manifest, catalog, and sources files were generated from https://github.com/kevinhu/sample-dbt
    pipeline = Pipeline.create({
        "run_id": "dbt-test",
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
                "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
                "sources_path": f"{test_resources_dir}/dbt_sources.json",
                "target_platform": "dbt",
                "load_schemas": True,
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/dbt_mces.json",
            },
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "dbt_mces.json",
        golden_path=test_resources_dir / "dbt_mces_golden.json",
    )
Example #4
def test_serde_large(pytestconfig, tmp_path):
    json_filename = "test_serde_large.json"
    output_filename = "output.json"

    test_resources_dir = pytestconfig.rootpath / "tests/unit/serde"

    golden_file = test_resources_dir / json_filename
    output_file = tmp_path / output_filename

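    # Round-trip the large MCE file through a file source -> file sink pipeline
    # and compare the output against the golden copy.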
    pipeline = Pipeline.create({
        "source": {
            "type": "file",
            "config": {
                "filename": str(golden_file)
            }
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": str(output_file)
            }
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    output = mce_helpers.load_json_file(tmp_path / output_filename)
    golden = mce_helpers.load_json_file(golden_file)
    mce_helpers.assert_mces_equal(output, golden)
Example #5
def test_data_lake_s3_ingest(pytestconfig, s3_populate, source_file, tmp_path,
                             mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

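    # Load the parametrized source config from SOURCE_FILES_PATH.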
    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {}
    config_dict["source"] = source
    config_dict["sink"] = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/{source_file}",
        },
    }

    config_dict["run_id"] = source_file

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=
        f"{test_resources_dir}/golden-files/s3/golden_mces_{source_file}",
    )
def create_and_run_test_pipeline(
    events: List[Union[MetadataChangeEventClass,
                       MetadataChangeProposalWrapper]],
    transformers: List[Dict[str, Any]],
    path: str,
) -> str:
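    # Helper: feed the given events through a mocked TestSource, apply the transformers,
    # and write the results to a uniquely named file sink, returning its path.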
    with mock.patch("tests.unit.test_source.TestSource.get_workunits"
                    ) as mock_getworkunits:
        mock_getworkunits.return_value = [
            workunit.MetadataWorkUnit(
                id=f"test-workunit-mce-{e.proposedSnapshot.urn}", mce=e)
            if isinstance(e, MetadataChangeEventClass) else
            workunit.MetadataWorkUnit(
                id=f"test-workunit-mcp-{e.entityUrn}-{e.aspectName}", mcp=e)
            for e in events
        ]
        events_file = f"{path}/{str(uuid4())}.json"
        pipeline = Pipeline.create(
            config_dict={
                "source": {
                    "type": "tests.unit.test_source.TestSource",
                    "config": {},
                },
                "transformers": transformers,
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": events_file
                    }
                },
            })

        pipeline.run()
        pipeline.raise_from_status()
    return events_file
Example #7
def test_ldap_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/ldap"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "ldap") as docker_services:
        # The openldap container loads the sample data after exposing the port publicly. As such,
        # we must wait a little bit extra to ensure that the sample data is loaded.
        wait_for_port(docker_services, "openldap", 389)
        time.sleep(5)

        pipeline = Pipeline.create({
            "run_id": "ldap-test",
            "source": {
                "type": "ldap",
                "config": {
                    "ldap_server": "ldap://localhost",
                    "ldap_user": "******",
                    "ldap_password": "******",
                    "base_dn": "dc=example,dc=org",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/ldap_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        output = mce_helpers.load_json_file(str(tmp_path / "ldap_mces.json"))
        golden = mce_helpers.load_json_file(
            str(test_resources_dir / "ldap_mces_golden.json"))
        mce_helpers.assert_mces_equal(output, golden)
Example #8
def test_serde_to_json(pytestconfig: PytestConfig, tmp_path: pathlib.Path,
                       json_filename: str) -> None:
    golden_file = pytestconfig.rootpath / json_filename

    output_filename = "output.json"
    output_file = tmp_path / output_filename

    pipeline = Pipeline.create({
        "source": {
            "type": "file",
            "config": {
                "filename": str(golden_file)
            }
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": str(output_file)
            }
        },
        "run_id": "serde_test",
    })
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{output_filename}",
        golden_path=golden_file,
    )
Example #9
def test_data_lake_local_ingest(pytestconfig, source_file, tmp_path,
                                mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"

    with open(os.path.join(SOURCE_FILES_PATH, source_file)) as f:
        source = json.load(f)

    config_dict = {}
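    # Point the path_spec at the local copy of the test data, enable profiling,
    # and drop the AWS config since this run reads from the local filesystem.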
    source["config"]["path_spec"]["include"] = source["config"]["path_spec"][
        "include"].replace("s3://my-test-bucket/",
                           "tests/integration/s3/test_data/local_system/")
    source["config"]["profiling"]["enabled"] = True
    source["config"].pop("aws_config")
    config_dict["source"] = source
    config_dict["sink"] = {
        "type": "file",
        "config": {
            "filename": f"{tmp_path}/{source_file}",
        },
    }

    config_dict["run_id"] = source_file

    pipeline = Pipeline.create(config_dict)
    pipeline.run()
    pipeline.raise_from_status()

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=f"{tmp_path}/{source_file}",
        golden_path=
        f"{test_resources_dir}/golden-files/local/golden_mces_{source_file}",
    )
Example #10
def test_dbt_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/dbt"

    pipeline = Pipeline.create(
        {
            "run_id": "dbt-test",
            "source": {
                "type": "dbt",
                "config": {
                    "manifest_path": f"{test_resources_dir}/dbt_manifest.json",
                    "catalog_path": f"{test_resources_dir}/dbt_catalog.json",
                    "target_platform": "dbt",
                    "load_schemas": True,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/dbt_mces.json",
                },
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()

    output = mce_helpers.load_json_file(str(tmp_path / "dbt_mces.json"))
    golden = mce_helpers.load_json_file(
        str(test_resources_dir / "dbt_mces_golden.json")
    )
    mce_helpers.assert_mces_equal(output, golden)
    def test_run_including_fake_transformation(self):

        pipeline = Pipeline.create({
            "source": {
                "type": "tests.unit.test_pipeline.FakeSource"
            },
            "transformers": [{
                "type":
                "tests.unit.test_pipeline.AddStatusRemovedTransformer"
            }],
            "sink": {
                "type": "tests.test_helpers.sink_helpers.RecordingSink"
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        expected_mce = get_initial_mce()

        dataset_snapshot = cast(DatasetSnapshotClass,
                                expected_mce.proposedSnapshot)
        dataset_snapshot.aspects.append(get_status_removed_aspect())

        sink_report: RecordingSinkReport = cast(RecordingSinkReport,
                                                pipeline.sink.get_report())

        self.assertEqual(len(sink_report.received_records), 1)
        self.assertEqual(expected_mce, sink_report.received_records[0].record)
Example #12
    def test_configure_with_rest_sink_initializes_graph(
        self, mock_source, mock_test_connection
    ):
        pipeline = Pipeline.create(
            {
                "source": {
                    "type": "file",
                    "config": {"filename": "test_events.json"},
                },
                "sink": {
                    "type": "datahub-rest",
                    "config": {
                        "server": "http://somehost.someplace.some:8080",
                        "token": "foo",
                    },
                },
            }
        )
        # assert that the default sink config is for a DatahubRestSink
        assert isinstance(pipeline.config.sink, DynamicTypedConfig)
        assert pipeline.config.sink.type == "datahub-rest"
        assert pipeline.config.sink.config == {
            "server": "http://somehost.someplace.some:8080",
            "token": "foo",
        }
        assert pipeline.ctx.graph is not None, "DataHubGraph should be initialized"
        assert pipeline.ctx.graph.config.server == pipeline.config.sink.config["server"]
        assert pipeline.ctx.graph.config.token == pipeline.config.sink.config["token"]
Example #13
def test_lookml_ingest(pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/lookml"

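    # Parse the LookML project in the test resources dir, mapping the
    # "my_connection" connection onto the "conn" platform.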
    pipeline = Pipeline.create({
        "run_id": "lookml-test",
        "source": {
            "type": "lookml",
            "config": {
                "base_folder": str(test_resources_dir),
                "connection_to_platform_map": {
                    "my_connection": "conn"
                },
                "parse_table_names_from_sql": True,
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/lookml_mces.json",
            },
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "lookml_mces.json",
        golden_path=test_resources_dir / "expected_output.json",
    )
Example #14
def test_serde_to_json(pytestconfig: PytestConfig, tmp_path: pathlib.Path,
                       json_filename: str) -> None:
    golden_file = pytestconfig.rootpath / json_filename

    output_filename = "output.json"
    output_file = tmp_path / output_filename

    pipeline = Pipeline.create({
        "source": {
            "type": "file",
            "config": {
                "filename": str(golden_file)
            }
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": str(output_file)
            }
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    output = mce_helpers.load_json_file(tmp_path / output_filename)
    golden = mce_helpers.load_json_file(golden_file)
    assert golden == output
def datahub_recipe():
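    # Load a recipe YAML and run it programmatically -- roughly the same flow as the
    # `datahub ingest` command shown in the later examples.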
    with open("path/to/recipe.yml") as config_file:
        config = yaml.safe_load(config_file)

    pipeline = Pipeline.create(config)
    pipeline.run()
    pipeline.raise_from_status()
Example #16
def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path,
                        mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"

    with docker_compose_runner(test_resources_dir / "docker-compose.yml",
                               "mongo") as docker_services:
        wait_for_port(docker_services, "testmongodb", 27017)

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create({
            "run_id": "mongodb-test",
            "source": {
                "type": "mongodb",
                "config": {
                    "connect_uri": "mongodb://localhost:57017",
                    "username": "******",
                    "password": "******",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/mongodb_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        # Verify the output.
        output = mce_helpers.load_json_file(str(tmp_path /
                                                "mongodb_mces.json"))
        golden = mce_helpers.load_json_file(
            str(test_resources_dir / "mongodb_mce_golden.json"))
        mce_helpers.assert_mces_equal(output, golden)
Example #17
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
) -> None:
    """Ingest metadata into DataHub."""

    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    config_file = pathlib.Path(config)
    pipeline_config = load_config_file(config_file)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config, dry_run, preview,
                                   preview_workunits)
    except ValidationError as e:
        click.echo(e, err=True)
        sys.exit(1)
    except Exception as e:
        # The pipeline_config may contain sensitive information, so we wrap the exception
        # in a SensitiveError to prevent detailed variable-level information from being logged.
        raise SensitiveError() from e

    logger.info("Starting metadata ingestion")
    pipeline.run()
    logger.info("Finished metadata ingestion")
    ret = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
    pipeline.log_ingestion_stats()
    sys.exit(ret)
Example #18
def ingest(config: str) -> None:
    """Main command for ingesting metadata into DataHub"""

    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

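    # Pick a config parser based on the file extension (.yaml/.yml vs .toml).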
    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".format(
                config_file.suffix
            )
        )

    with config_file.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    try:
        logger.info(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    except ValidationError as e:
        click.echo(e, err=True)
        sys.exit(1)

    pipeline.run()
    ret = pipeline.pretty_print_summary()
    sys.exit(ret)
Example #19
def ingest(config: str):
    """Main command for ingesting metadata into DataHub"""

    config_file = pathlib.Path(config)
    if not config_file.is_file():
        raise ConfigurationError(f"Cannot open config file {config}")

    config_mech: ConfigurationMechanism
    if config_file.suffix in [".yaml", ".yml"]:
        config_mech = YamlConfigurationMechanism()
    elif config_file.suffix == ".toml":
        config_mech = TomlConfigurationMechanism()
    else:
        raise ConfigurationError(
            "Only .toml and .yml are supported. Cannot process file type {}".
            format(config_file.suffix))

    with config_file.open() as fp:
        pipeline_config = config_mech.load_config(fp)

    with nicely_formatted_validation_errors():
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
Example #20
def test_mongodb_ingest(mongodb, pytestconfig, tmp_path, mock_time):
    test_resources_dir = pytestconfig.rootpath / "tests/integration/mongodb"

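    # Ingest from the MongoDB instance provided by the `mongodb` fixture and
    # write the MCEs to a local file for comparison against the golden file.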
    pipeline = Pipeline.create({
        "run_id": "mongodb-test",
        "source": {
            "type": "mongodb",
            "config": {
                "connect_uri": "mongodb://localhost:57017",
                "username": "******",
                "password": "******",
            },
        },
        "sink": {
            "type": "file",
            "config": {
                "filename": f"{tmp_path}/mongodb_mces.json",
            },
        },
    })
    pipeline.run()
    pipeline.raise_from_status()

    output = mce_helpers.load_json_file(str(tmp_path / "mongodb_mces.json"))
    golden = mce_helpers.load_json_file(
        str(test_resources_dir / "mongodb_mce_golden.json"))
    mce_helpers.assert_mces_equal(output, golden)
Example #21
def test_azure_ad_source_nested_groups(pytestconfig, tmp_path):

    test_resources_dir: pathlib.Path = (pytestconfig.rootpath /
                                        "tests/integration/azure_ad")

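    # Patch the Azure AD source's token and Graph API helpers so the run reads
    # canned users/groups fixtures instead of calling Microsoft Graph.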
    with patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource.get_token"
    ) as mock_token, patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_users"
    ) as mock_users, patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_groups"
    ) as mock_groups, patch(
            "datahub.ingestion.source.identity.azure_ad.AzureADSource._get_azure_ad_group_members"
    ) as mock_group_users:
        mocked_functions(
            test_resources_dir,
            mock_token,
            mock_users,
            mock_groups,
            mock_group_users,
            True,
        )
        # Run an Azure AD ingestion run.
        pipeline = Pipeline.create({
            "run_id": "test-azure-ad",
            "source": {
                "type": "azure-ad",
                "config": {
                    "client_id": "00000000-0000-0000-0000-000000000000",
                    "tenant_id": "00000000-0000-0000-0000-000000000000",
                    "client_secret": "client_secret",
                    "redirect":
                    "https://login.microsoftonline.com/common/oauth2/nativeclient",
                    "authority":
                    "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000",
                    "token_url":
                    "https://login.microsoftonline.com/00000000-0000-0000-0000-000000000000/oauth2/token",
                    "graph_url": "https://graph.microsoft.com/v1.0",
                    "ingest_group_membership": True,
                    "ingest_groups": True,
                    "ingest_users": False,
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/azure_ad_mces_nested_groups.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "azure_ad_mces_nested_groups.json",
        golden_path=test_resources_dir /
        "azure_ad_mces_golden_nested_groups.json",
    )
def test_bq_usage_source(pytestconfig, tmp_path):
    # from google.cloud.logging_v2 import ProtobufEntry

    test_resources_dir: pathlib.Path = (pytestconfig.rootpath /
                                        "tests/integration/bigquery-usage")
    bigquery_reference_logs_path = test_resources_dir / "bigquery_logs.json"

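    # When WRITE_REFERENCE_FILE is set, fetch real BigQuery log entries and save them
    # as the reference fixture; otherwise the stored fixture is replayed through a
    # mocked GCP logging client below.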
    if WRITE_REFERENCE_FILE:
        source = BigQueryUsageSource.create(
            dict(
                projects=[
                    "harshal-playground-306419",
                ],
                start_time=datetime.now(tz=timezone.utc) - timedelta(days=25),
            ),
            PipelineContext(run_id="bq-usage-test"),
        )
        entries = list(
            source._get_bigquery_log_entries(source._make_bigquery_clients()))

        entries = [entry._replace(logger=None) for entry in entries]
        log_entries = jsonpickle.encode(entries, indent=4)
        with bigquery_reference_logs_path.open("w") as logs:
            logs.write(log_entries)

    with unittest.mock.patch(
            "datahub.ingestion.source.usage.bigquery_usage.GCPLoggingClient",
            autospec=True) as MockClient:
        # Add mock BigQuery API responses.
        with bigquery_reference_logs_path.open() as logs:
            reference_logs = jsonpickle.decode(logs.read())
        MockClient().list_entries.return_value = reference_logs

        # Run a BigQuery usage ingestion run.
        pipeline = Pipeline.create({
            "run_id": "test-bigquery-usage",
            "source": {
                "type": "bigquery-usage",
                "config": {
                    "projects": ["sample-bigquery-project-1234"],
                    "start_time": "2021-01-01T00:00Z",
                    "end_time": "2021-07-01T00:00Z",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/bigquery_usages.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "bigquery_usages.json",
        golden_path=test_resources_dir / "bigquery_usages_golden.json",
    )
Example #23
    def get_current_checkpoint_from_pipeline(
        pipeline_config_dict: Dict[str, Any]
    ) -> Optional[Checkpoint]:
        # Run the pipeline, then pull the checkpoint that the MySQL source recorded
        # for its default ingestion job.
        pipeline = Pipeline.create(pipeline_config_dict)
        pipeline.run()
        pipeline.raise_from_status()
        mysql_source = cast(MySQLSource, pipeline.source)
        return mysql_source.get_current_checkpoint(
            mysql_source.get_default_ingestion_job_id()
        )
Example #24
def test_trino_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path,
                      mock_time):

    # Run everything inside a filesystem isolated to tmp_path.
    with fs_helpers.isolated_filesystem(tmp_path):

        # Run the metadata ingestion pipeline for the Trino catalog that refers to the Postgres database.
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type":
                data_platform,
                "config":
                TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)
        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
Example #25
def test_looker_ingest(pytestconfig, tmp_path, mock_time):
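    # Mock the Looker API client so the source sees a single dashboard with one element.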
    mocked_client = mock.MagicMock()
    with mock.patch(
            "datahub.ingestion.source.looker.LookerDashboardSource._get_looker_client",
            mocked_client,
    ):
        mocked_client.return_value.all_dashboards.return_value = [
            Dashboard(id="1")
        ]
        mocked_client.return_value.dashboard.return_value = Dashboard(
            id="1",
            title="foo",
            created_at=datetime.utcfromtimestamp(time.time()),
            description="lorem ipsum",
            dashboard_elements=[
                DashboardElement(
                    id="2",
                    type="",
                    subtitle_text="Some text",
                    query=Query(
                        model="data",
                        view="my_view",
                        dynamic_fields=
                        '[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
                    ),
                )
            ],
        )

        test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"

        pipeline = Pipeline.create({
            "run_id": "looker-test",
            "source": {
                "type": "looker",
                "config": {
                    "base_url": "https://looker.company.com",
                    "client_id": "foo",
                    "client_secret": "bar",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename": f"{tmp_path}/looker_mces.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=tmp_path / "looker_mces.json",
            golden_path=test_resources_dir / "expected_output.json",
        )
Example #26
    def test_configure(self, mock_sink, mock_source):
        pipeline = Pipeline.create(
            {
                "source": {"type": "kafka", "config": {"bootstrap": "localhost:9092"}},
                "sink": {"type": "console"},
            }
        )
        pipeline.run()
        pipeline.raise_from_status()
        mock_source.assert_called_once()
        mock_sink.assert_called_once()
def test_tableau_ingest(pytestconfig, tmp_path):

    global test_resources_dir
    test_resources_dir = pathlib.Path(
        pytestconfig.rootpath / "tests/integration/tableau"
    )

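    # Mock the Tableau Server client so the source gets canned metadata-query
    # responses instead of contacting a live server.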
    with mock.patch("tableauserverclient.Server") as mock_sdk:
        mock_client = mock.Mock()
        mocked_metadata = mock.Mock()
        mocked_metadata.query.side_effect = side_effect_query_metadata
        mock_client.metadata = mocked_metadata
        mock_client.auth = mock.Mock()
        mock_client.auth.sign_in.return_value = None
        mock_client.auth.sign_out.return_value = None
        mock_sdk.return_value = mock_client
        mock_sdk._auth_token = "ABC"

        pipeline = Pipeline.create(
            {
                "run_id": "tableau-test",
                "source": {
                    "type": "tableau",
                    "config": {
                        "username": "******",
                        "password": "******",
                        "connect_uri": "https://do-not-connect",
                        "site": "acryl",
                        "projects": ["default", "Project 2"],
                        "ingest_tags": True,
                        "ingest_owner": True,
                        "default_schema_map": {
                            "dvdrental": "public",
                            "someotherdb": "schema",
                        },
                    },
                },
                "sink": {
                    "type": "file",
                    "config": {
                        "filename": f"{tmp_path}/tableau_mces.json",
                    },
                },
            }
        )
        pipeline.run()
        pipeline.raise_from_status()

        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=f"{tmp_path}/tableau_mces.json",
            golden_path=test_resources_dir / "tableau_mces_golden.json",
            ignore_paths=mce_helpers.IGNORE_PATH_TIMESTAMPS,
        )
Example #28
def test_trino_hive_ingest(loaded_trino, test_resources_dir, pytestconfig,
                           tmp_path, mock_time):

    # Run the metadata ingestion pipeline for the Trino catalog that refers to the Hive database.
    mce_out_file = "trino_hive_mces.json"
    events_file = tmp_path / mce_out_file

    pipeline_config = {
        "run_id": "trino-hive-test",
        "source": {
            "type":
            data_platform,
            "config":
            TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Limitation 1 - The MCE contains "nullable": true for all fields in the Trino database,
    # irrespective of the NOT NULL constraints present in the underlying Postgres database.
    # This is an issue with Trino itself - see https://github.com/trinodb/trino/issues/6400
    # (related: https://github.com/trinodb/trino/issues/4070).
    # Limitation 2 - Dataset properties for Postgres views (view definition, etc.) are not part of the MCE from Trino.
    # Postgres views are exposed as tables in Trino; this behavior depends on the Trino connector implementation - https://trino.io/episodes/18.html

    # Run the metadata ingestion pipeline for trino catalog referring to hive database
    # config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
    # run_datahub_cmd(["ingest", "-c", f"{config_file}"])

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=events_file,
        golden_path=test_resources_dir / "trino_hive_mces_golden.json",
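        # These Hive table properties change between runs, so exclude them from the diff.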
        ignore_paths=[
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
        ],
    )
Example #29
def run(
    ctx: click.Context,
    config: str,
    dry_run: bool,
    preview: bool,
    strict_warnings: bool,
    preview_workunits: int,
    suppress_error_logs: bool,
) -> None:
    """Ingest metadata into DataHub."""

    logger.info("DataHub CLI version: %s", datahub_package.nice_version_name())

    config_file = pathlib.Path(config)
    pipeline_config = load_config_file(config_file)

    try:
        logger.debug(f"Using config: {pipeline_config}")
        pipeline = Pipeline.create(pipeline_config, dry_run, preview, preview_workunits)
    except ValidationError as e:
        click.echo(e, err=True)
        sys.exit(1)
    except Exception as e:
        # The pipeline_config may contain sensitive information, so we wrap the exception
        # in a SensitiveError to prevent detailed variable-level information from being logged.
        raise SensitiveError() from e

    logger.info("Starting metadata ingestion")
    try:
        pipeline.run()
    except Exception as e:
        logger.info(
            f"Source ({pipeline.config.source.type}) report:\n{pipeline.source.get_report().as_string()}"
        )
        logger.info(
            f"Sink ({pipeline.config.sink.type}) report:\n{pipeline.sink.get_report().as_string()}"
        )
        # We don't want to log sensitive information in variables if the pipeline fails due to
        # an unexpected error. Disable printing sensitive info to logs if ingestion is running
        # with `--suppress-error-logs` flag.
        if suppress_error_logs:
            raise SensitiveError() from e
        else:
            raise e
    else:
        logger.info("Finished metadata pipeline")
        pipeline.log_ingestion_stats()
        ret = pipeline.pretty_print_summary(warnings_as_failure=strict_warnings)
        sys.exit(ret)
Example #30
def test_okta_source_include_deprovisioned_suspended_users(
        pytestconfig, tmp_path):

    test_resources_dir: pathlib.Path = pytestconfig.rootpath / "tests/integration/okta"

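    # Patch the Okta SDK client so the source reads mocked users and groups.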
    with patch(
            "datahub.ingestion.source.identity.okta.OktaClient") as MockClient:

        _init_mock_okta_client(test_resources_dir, MockClient)

        # Run an Okta ingestion run.
        pipeline = Pipeline.create({
            "run_id": "test-okta-usage",
            "source": {
                "type": "okta",
                "config": {
                    "okta_domain": "mock-domain.okta.com",
                    "okta_api_token": "mock-okta-token",
                    "ingest_users": "True",
                    "ingest_groups": "True",
                    "ingest_group_membership": "True",
                    "okta_profile_to_username_attr": "login",
                    "okta_profile_to_username_regex": "([^@]+)",
                    "okta_profile_to_group_name_attr": "name",
                    "okta_profile_to_group_name_regex": "(.*)",
                    "include_deprovisioned_users": "True",
                    "include_suspended_users": "True",
                    "page_size": "2",
                    "delay_seconds": "0.00",
                },
            },
            "sink": {
                "type": "file",
                "config": {
                    "filename":
                    f"{tmp_path}/okta_mces_include_deprovisioned_suspended_users.json",
                },
            },
        })
        pipeline.run()
        pipeline.raise_from_status()

    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path /
        "okta_mces_include_deprovisioned_suspended_users.json",
        golden_path=test_resources_dir /
        "okta_mces_golden_include_deprovisioned_suspended_users.json",
    )