Example #1
def test_trino_instance_ingest(loaded_trino, test_resources_dir, pytestconfig,
                               tmp_path, mock_time):
    instance = "production_warehouse"
    platform = "trino"
    mce_out_file = "trino_instance_mces.json"
    events_file = tmp_path / mce_out_file
    pipeline_config = {
        "run_id": "trino-hive-instance-test",
        "source": {
            "type":
            data_platform,
            "config":
            TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                platform_instance="production_warehouse",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Assert that all events generated have instance specific urns
    urn_pattern = "^" + re.escape(
        f"urn:li:dataset:(urn:li:dataPlatform:{platform},{instance}.")
    assert (mce_helpers.assert_mce_entity_urn(
        "ALL",
        entity_type="dataset",
        regex_pattern=urn_pattern,
        file=events_file,
    ) >= 0), "There should be at least one match"

    assert (mce_helpers.assert_mcp_entity_urn(
        "ALL",
        entity_type="dataset",
        regex_pattern=urn_pattern,
        file=events_file,
    ) >= 0), "There should be at least one MCP"

    # all dataset entities emitted must have a dataPlatformInstance aspect emitted
    # there must be at least one entity emitted
    assert (mce_helpers.assert_for_each_entity(
        entity_type="dataset",
        aspect_name="dataPlatformInstance",
        aspect_field_matcher={
            "instance":
            f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:{platform},{instance})"
        },
        file=events_file,
    ) >= 1)
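
For context, here is a minimal sketch of the instance-qualified URN shape asserted above. It assumes the make_dataset_urn_with_platform_instance helper from datahub.emitter.mce_builder and a hypothetical table name; it is not part of the test itself.

# Sketch only: how a platform-instance-qualified dataset URN looks and why the
# regex above matches it. The helper and the table name are assumptions.
import re

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

urn = make_dataset_urn_with_platform_instance(
    platform="trino",
    name="hivedb.db1.some_table",  # hypothetical table
    platform_instance="production_warehouse",
    env="PROD",
)
# -> urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.hivedb.db1.some_table,PROD)
pattern = "^" + re.escape(
    "urn:li:dataset:(urn:li:dataPlatform:trino,production_warehouse.")
assert re.match(pattern, urn)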
Example #2
def test_trino_ingest(loaded_trino, test_resources_dir, pytestconfig, tmp_path,
                      mock_time):

    # Run the metadata ingestion pipeline.
    with fs_helpers.isolated_filesystem(tmp_path):

        # Run the metadata ingestion pipeline for trino catalog referring to postgres database
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type":
                data_platform,
                "config":
                TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)
        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
Example #3
def test_trino_hive_ingest(loaded_trino, test_resources_dir, pytestconfig,
                           tmp_path, mock_time):

    # Run the metadata ingestion pipeline for the trino catalog referring to the hive database
    mce_out_file = "trino_hive_mces.json"
    events_file = tmp_path / mce_out_file

    pipeline_config = {
        "run_id": "trino-hive-test",
        "source": {
            "type":
            data_platform,
            "config":
            TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Limitation 1 - The MCE contains "nullable": true for all fields in the trino database, irrespective of NOT NULL constraints present in the underlying postgres database.
    # This is a trino issue, reported here: https://github.com/trinodb/trino/issues/6400 (related: https://github.com/trinodb/trino/issues/4070).
    # Limitation 2 - Dataset properties for postgres views (view definition, etc.) are not part of the MCE from trino.
    # Postgres views are exposed as tables in trino. This behavior depends on the trino connector implementation - https://trino.io/episodes/18.html

    # Alternative: run the same ingestion via the CLI with a recipe file.
    # config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
    # run_datahub_cmd(["ingest", "-c", f"{config_file}"])

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=events_file,
        golden_path=test_resources_dir / "trino_hive_mces_golden.json",
        ignore_paths=[
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
        ],
    )
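
The ignore_paths entries above skip Hive table properties that change on every run (transient_lastddltime, numfiles, totalsize). A rough sketch of the mechanism, assuming the golden-file comparison is backed by deepdiff-style exclude_regex_paths (an assumption about check_golden_file's internals; the sample dicts are hypothetical):

# Sketch only: exclude regexes suppress differences in volatile properties so
# that only meaningful drift shows up in the diff.
from deepdiff import DeepDiff

golden = {"customProperties": {"numfiles": "1", "totalsize": "100", "owner": "hive"}}
output = {"customProperties": {"numfiles": "3", "totalsize": "250", "owner": "hive"}}

diff = DeepDiff(
    golden,
    output,
    exclude_regex_paths=[
        r"root\['customProperties'\]\['numfiles'\]",
        r"root\['customProperties'\]\['totalsize'\]",
    ],
)
assert diff == {}  # the volatile properties are ignored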
Example #4
def base_pipeline_config(events_file):

    return {
        "run_id": "hive-test",
        "source": {
            "type":
            data_platform,
            "config":
            HiveConfig(scheme="hive",
                       database="db1",
                       host_port="localhost:10000").dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }
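
A short usage sketch for base_pipeline_config, following the same create/run/raise_from_status pattern as the Trino tests above; the test function name is hypothetical.

# Hypothetical usage of base_pipeline_config.
def test_hive_ingest_sketch(tmp_path):
    events_file = tmp_path / "hive_mces.json"
    pipeline = Pipeline.create(base_pipeline_config(events_file))
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)
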
def create_test_data(test_file):
    assertion_urn = "urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b"
    dataset_urn = make_dataset_urn(platform="postgres", name="foo")
    assertion_info = AssertionInfoClass(
        type=AssertionTypeClass.DATASET,
        customProperties={"suite_name": "demo_suite"},
        datasetAssertion=DatasetAssertionInfoClass(
            fields=[make_schema_field_urn(dataset_urn, "col1")],
            dataset=dataset_urn,
            scope=DatasetAssertionScopeClass.DATASET_COLUMN,
            operator=AssertionStdOperatorClass.LESS_THAN,
            nativeType="column_value_is_less_than",
            aggregation=AssertionStdAggregationClass.IDENTITY,
            nativeParameters={"max_value": "99"},
        ),
    )
    # The assertion definition
    mcp1 = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType="UPSERT",
        entityUrn=assertion_urn,
        aspectName="assertionInfo",
        aspect=assertion_info,
    )
    timestamps = [
        1643794280350,
        1643794280352,
        1643794280354,
        1643880726872,
        1643880726874,
        1643880726875,
    ]
    msg_ids = []
    # The assertion run event attached to the dataset
    mcp2 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[0],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'IN'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[0]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=90,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp3 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[1],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'US'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[1]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.FAILURE,
                actualAggValue=101,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )
    # Result of evaluating this assertion on the whole dataset
    mcp4 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[2],
            partitionSpec=PartitionSpecClass(
                partition="FULL_TABLE_SNAPSHOT",
                type=PartitionTypeClass.FULL_TABLE,
            ),
            messageId=str(timestamps[2]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=93,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp5 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[3],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'IN'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[3]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=90,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )
    mcp6 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[4],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'US'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[4]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.FAILURE,
                actualAggValue=101,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    # Result of evaluating this assertion on the whole dataset
    mcp7 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[5],
            partitionSpec=PartitionSpecClass(
                partition="FULL_TABLE_SNAPSHOT",
                type=PartitionTypeClass.FULL_TABLE,
            ),
            messageId=str(timestamps[5]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=93,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    fileSink: FileSink = FileSink.create(
        FileSinkConfig(filename=test_file),
        ctx=PipelineContext(run_id="test-file"))
    for mcp in [mcp1, mcp2, mcp3, mcp4, mcp5, mcp6, mcp7]:
        fileSink.write_record_async(RecordEnvelope(record=mcp, metadata={}),
                                    write_callback=NoopWriteCallback())
    fileSink.close()
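
The six run-event MCPs above differ only in timestamp, partition spec, and result. Below is a hedged sketch of a loop-friendly alternative; the _run_event helper and its parameter layout are my own and not part of the original test data.

# Sketch only: factoring the repeated AssertionRunEventClass construction into
# a helper. The parameters mirror the values that vary across mcp2..mcp7.
def _run_event(assertion_urn, dataset_urn, ts, partition, partition_type,
               result_type, value):
    return MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=ts,
            partitionSpec=PartitionSpecClass(partition=partition, type=partition_type),
            messageId=str(ts),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=result_type,
                actualAggValue=value,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )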
Example #6
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
    extra_docs: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    entity_extra_docs = {}
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*.md", recursive=True):
            m = re.search("/docs/entities/(.*)/*.md", path)
            if m:
                entity_name = m.group(1)
                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                    final_markdown = preprocess_markdown(file_contents)
                    entity_extra_docs[entity_name] = final_markdown

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    if entity_extra_docs:
        for entity_name, doc in entity_extra_docs.items():
            entity_registry.get(entity_name).doc_file_contents = doc

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    generated_docs_dir = "../docs/generated/metamodel"
    import shutil

    shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
    entity_names = [(x, entity_registry.get(x))
                    for x in generated_documentation]

    sorted_entity_names = get_sorted_entity_names(entity_names)

    index = 0
    for category, sorted_entities in sorted_entity_names:
        for entity_name in sorted_entities:
            entity_def = entity_registry.get(entity_name)

            entity_category = entity_def.category
            entity_dir = f"{generated_docs_dir}/entities/"
            import os

            os.makedirs(entity_dir, exist_ok=True)

            with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
                fp.write("---\n")
                fp.write(f"sidebar_position: {index}\n")
                fp.write("---\n")
                fp.write(generated_documentation[entity_name])
                index += 1

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(RecordEnvelope(e, metadata={}),
                                        write_callback=NoopWriteCallback())
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {
                    "filename": file
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith(
            "http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception as e:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise e
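
Note that the generated pipeline config is written with json.dump into pipeline.yml; because the flow-style JSON produced here is also valid YAML, the file still loads as YAML. A small sketch with an illustrative path and config:

# Sketch only: a JSON-serialized pipeline config remains readable by YAML tooling.
import json

import yaml

config = {"source": {"type": "file", "config": {"filename": "events.json"}}}
with open("pipeline.yml", "w") as f:
    json.dump(config, f, indent=2)

with open("pipeline.yml") as f:
    assert yaml.safe_load(f) == config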
Example #7
def generate(schema_files: List[str], server: Optional[str],
             file: Optional[str], dot: Optional[str],
             png: Optional[str]) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(RecordEnvelope(e, metadata={}),
                                        write_callback=NoopWriteCallback())
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {
                    "filename": file
                },
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith(
            "http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")

        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(src=relnship.src,
                                   dst=relnship.dst,
                                   label=relnship.name))
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)
        if png:
            try:
                graph.write_png(png)
            except Exception as e:
                logger.error(
                    "Failed to create png file. Do you have graphviz installed?"
                )
                raise e