def test_trino_instance_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    instance = "production_warehouse"
    platform = "trino"
    mce_out_file = "trino_instance_mces.json"
    events_file = tmp_path / mce_out_file

    pipeline_config = {
        "run_id": "trino-hive-instance-test",
        "source": {
            "type": data_platform,
            "config": TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                platform_instance="production_warehouse",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Assert that all generated events have instance-specific urns.
    urn_pattern = "^" + re.escape(
        f"urn:li:dataset:(urn:li:dataPlatform:{platform},{instance}."
    )
    assert (
        mce_helpers.assert_mce_entity_urn(
            "ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=events_file,
        )
        >= 0
    ), "There should be at least one match"

    assert (
        mce_helpers.assert_mcp_entity_urn(
            "ALL",
            entity_type="dataset",
            regex_pattern=urn_pattern,
            file=events_file,
        )
        >= 0
    ), "There should be at least one MCP"

    # Every dataset entity emitted must have a dataPlatformInstance aspect,
    # and there must be at least one entity emitted.
    assert (
        mce_helpers.assert_for_each_entity(
            entity_type="dataset",
            aspect_name="dataPlatformInstance",
            aspect_field_matcher={
                "instance": f"urn:li:dataPlatformInstance:(urn:li:dataPlatform:{platform},{instance})"
            },
            file=events_file,
        )
        >= 1
    )
def test_trino_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    with fs_helpers.isolated_filesystem(tmp_path):
        # Ingest the trino catalog that refers to the postgres database.
        mce_out_file = "trino_mces.json"
        events_file = tmp_path / mce_out_file

        pipeline_config = {
            "run_id": "trino-test",
            "source": {
                "type": data_platform,
                "config": TrinoConfig(
                    host_port="localhost:5300",
                    database="postgresqldb",
                    database_alias="library_catalog",
                    username="******",
                    schema_pattern=AllowDenyPattern(allow=["^librarydb"]),
                    profile_pattern=AllowDenyPattern(
                        allow=["library_catalog.librarydb.*"]
                    ),
                    profiling=GEProfilingConfig(
                        enabled=True,
                        include_field_null_count=True,
                        include_field_min_value=True,
                        include_field_max_value=True,
                        include_field_mean_value=True,
                        include_field_median_value=True,
                        include_field_stddev_value=True,
                        include_field_quantiles=True,
                        include_field_distinct_value_frequencies=True,
                        include_field_histogram=True,
                        include_field_sample_values=True,
                    ),
                ).dict(),
            },
            "sink": {
                "type": "file",
                "config": FileSinkConfig(filename=str(events_file)).dict(),
            },
        }

        # Run the metadata ingestion pipeline.
        pipeline = Pipeline.create(pipeline_config)
        pipeline.run()
        pipeline.pretty_print_summary()
        pipeline.raise_from_status(raise_warnings=True)

        # Verify the output.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path="trino_mces.json",
            golden_path=test_resources_dir / "trino_mces_golden.json",
        )
def test_trino_hive_ingest(
    loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
):
    # Ingest the trino catalog that refers to the hive database.
    mce_out_file = "trino_hive_mces.json"
    events_file = tmp_path / mce_out_file

    pipeline_config = {
        "run_id": "trino-hive-test",
        "source": {
            "type": data_platform,
            "config": TrinoConfig(
                host_port="localhost:5300",
                database="hivedb",
                username="******",
                schema_pattern=AllowDenyPattern(allow=["^db1"]),
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }

    # Run the metadata ingestion pipeline.
    pipeline = Pipeline.create(pipeline_config)
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)

    # Limitation 1 - The MCE contains "nullable": true for all fields in the trino database,
    # irrespective of NOT NULL constraints present in the underlying postgres database.
    # This is an issue with trino, reported here: https://github.com/trinodb/trino/issues/6400
    # (related: https://github.com/trinodb/trino/issues/4070).
    # Limitation 2 - Dataset properties for postgres views (view definition, etc.) are not part
    # of the MCE from trino. Postgres views are exposed as tables in trino; this depends on the
    # trino connector implementation - https://trino.io/episodes/18.html

    # Alternative: run the same ingestion via a recipe file.
    # config_file = (test_resources_dir / "trino_hive_to_file.yml").resolve()
    # run_datahub_cmd(["ingest", "-c", f"{config_file}"])

    # Verify the output.
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=events_file,
        golden_path=test_resources_dir / "trino_hive_mces_golden.json",
        ignore_paths=[
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['transient_lastddltime'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['numfiles'\]",
            r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['totalsize'\]",
        ],
    )
def base_pipeline_config(events_file):
    return {
        "run_id": "hive-test",
        "source": {
            "type": data_platform,
            "config": HiveConfig(
                scheme="hive", database="db1", host_port="localhost:10000"
            ).dict(),
        },
        "sink": {
            "type": "file",
            "config": FileSinkConfig(filename=str(events_file)).dict(),
        },
    }
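# A minimal usage sketch (not part of the original tests): base_pipeline_config() is
# assumed to be consumed the same way as the inline configs in the trino tests above --
# build a Pipeline from the returned dict, run it, and fail on warnings. The helper
# name and output file name below are hypothetical.
def run_base_hive_pipeline(tmp_path):
    events_file = tmp_path / "hive_mces.json"
    pipeline = Pipeline.create(base_pipeline_config(events_file))
    pipeline.run()
    pipeline.pretty_print_summary()
    pipeline.raise_from_status(raise_warnings=True)
    return events_file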
def create_test_data(test_file):
    assertion_urn = "urn:li:assertion:2d3b06a6e77e1f24adc9860a05ea089b"
    dataset_urn = make_dataset_urn(platform="postgres", name="foo")
    assertion_info = AssertionInfoClass(
        type=AssertionTypeClass.DATASET,
        customProperties={"suite_name": "demo_suite"},
        datasetAssertion=DatasetAssertionInfoClass(
            fields=[make_schema_field_urn(dataset_urn, "col1")],
            dataset=dataset_urn,
            scope=DatasetAssertionScopeClass.DATASET_COLUMN,
            operator=AssertionStdOperatorClass.LESS_THAN,
            nativeType="column_value_is_less_than",
            aggregation=AssertionStdAggregation.IDENTITY,
            nativeParameters={"max_value": "99"},
        ),
    )

    # The assertion definition
    mcp1 = MetadataChangeProposalWrapper(
        entityType="assertion",
        changeType="UPSERT",
        entityUrn=assertion_urn,
        aspectName="assertionInfo",
        aspect=assertion_info,
    )

    timestamps = [
        1643794280350,
        1643794280352,
        1643794280354,
        1643880726872,
        1643880726874,
        1643880726875,
    ]
    msg_ids = []

    # The assertion run event attached to the dataset
    mcp2 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[0],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'IN'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[0]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=90,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp3 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[1],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'US'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[1]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.FAILURE,
                actualAggValue=101,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    # Result of evaluating this assertion on the whole dataset
    mcp4 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[2],
            partitionSpec=PartitionSpecClass(
                partition="FULL_TABLE_SNAPSHOT",
                type=PartitionTypeClass.FULL_TABLE,
            ),
            messageId=str(timestamps[2]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=93,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp5 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[3],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'IN'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[3]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=90,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    mcp6 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[4],
            partitionSpec=PartitionSpecClass(
                partition="[{'country': 'US'}]",
                type=PartitionTypeClass.PARTITION,
            ),
            messageId=str(timestamps[4]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.FAILURE,
                actualAggValue=101,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    # Result of evaluating this assertion on the whole dataset
    mcp7 = MetadataChangeProposalWrapper(
        entityType="assertion",
        entityUrn=assertion_urn,
        changeType="UPSERT",
        aspectName="assertionRunEvent",
        aspect=AssertionRunEventClass(
            timestampMillis=timestamps[5],
            partitionSpec=PartitionSpecClass(
                partition="FULL_TABLE_SNAPSHOT",
                type=PartitionTypeClass.FULL_TABLE,
            ),
            messageId=str(timestamps[5]),
            assertionUrn=assertion_urn,
            asserteeUrn=dataset_urn,
            result=AssertionResultClass(
                type=AssertionResultTypeClass.SUCCESS,
                actualAggValue=93,
                externalUrl="http://example.com/uuid1",
            ),
            runId="uuid1",
            status=AssertionRunStatusClass.COMPLETE,
        ),
    )

    fileSink: FileSink = FileSink.create(
        FileSinkConfig(filename=test_file), ctx=PipelineContext(run_id="test-file")
    )
    for mcp in [mcp1, mcp2, mcp3, mcp4, mcp5, mcp6, mcp7]:
        fileSink.write_record_async(
            RecordEnvelope(record=mcp, metadata={}), write_callback=NoopWriteCallback()
        )
    fileSink.close()
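# A minimal usage sketch (not part of the original file): create_test_data() writes the
# assertion MCPs to a JSON file, which can then be re-ingested with a "file" source,
# mirroring the file-based pipeline configs used elsewhere in this section. The run_id,
# file names, and the assumption that Pipeline is importable here are all hypothetical.
def ingest_assertion_test_data(tmp_path):
    test_file = str(tmp_path / "assertion_mcps.json")
    create_test_data(test_file)
    pipeline = Pipeline.create(
        {
            "run_id": "assertion-file-test",
            "source": {"type": "file", "config": {"filename": test_file}},
            "sink": {
                "type": "file",
                "config": {"filename": str(tmp_path / "assertion_mcps_out.json")},
            },
        }
    )
    pipeline.run()
    pipeline.raise_from_status()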
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
    extra_docs: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    entity_extra_docs = {}
    if extra_docs:
        for path in glob.glob(f"{extra_docs}/**/*.md", recursive=True):
            m = re.search("/docs/entities/(.*)/*.md", path)
            if m:
                entity_name = m.group(1)
                with open(path, "r") as doc_file:
                    file_contents = doc_file.read()
                final_markdown = preprocess_markdown(file_contents)
                entity_extra_docs[entity_name] = final_markdown

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    if entity_extra_docs:
        for entity_name in entity_extra_docs:
            entity_registry.get(entity_name).doc_file_contents = entity_extra_docs[
                entity_name
            ]

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    generated_docs_dir = "../docs/generated/metamodel"

    import shutil

    shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
    entity_names = [(x, entity_registry.get(x)) for x in generated_documentation]
    sorted_entity_names = get_sorted_entity_names(entity_names)

    index = 0
    for category, sorted_entities in sorted_entity_names:
        for entity_name in sorted_entities:
            entity_def = entity_registry.get(entity_name)
            entity_category = entity_def.category
            entity_dir = f"{generated_docs_dir}/entities/"
            import os

            os.makedirs(entity_dir, exist_ok=True)

            with open(f"{entity_dir}/{entity_name}.md", "w") as fp:
                fp.write("---\n")
                fp.write(f"sidebar_position: {index}\n")
                fp.write("---\n")
                fp.write(generated_documentation[entity_name])
                index += 1

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(
                RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
            )
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {"filename": file},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith("http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")
        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)

    if png:
        try:
            graph.write_png(png)
        except Exception as e:
            logger.error("Failed to create png file. Do you have graphviz installed?")
            raise e
def generate(
    schema_files: List[str],
    server: Optional[str],
    file: Optional[str],
    dot: Optional[str],
    png: Optional[str],
) -> None:
    logger.info(f"server = {server}")
    logger.info(f"file = {file}")
    logger.info(f"dot = {dot}")
    logger.info(f"png = {png}")

    for schema_file in schema_files:
        if schema_file.endswith(".yml") or schema_file.endswith(".yaml"):
            # registry file
            load_registry_file(schema_file)
        else:
            # schema file
            load_schema_file(schema_file)

    relationship_graph = RelationshipGraph()
    events = generate_stitched_record(relationship_graph)

    if file:
        logger.info(f"Will write events to {file}")
        Path(file).parent.mkdir(parents=True, exist_ok=True)
        fileSink = FileSink(
            PipelineContext(run_id="generated-metaModel"),
            FileSinkConfig(filename=file),
        )
        for e in events:
            fileSink.write_record_async(
                RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
            )
        fileSink.close()
        pipeline_config = {
            "source": {
                "type": "file",
                "config": {"filename": file},
            },
            "sink": {
                "type": "datahub-rest",
                "config": {
                    "server": "${DATAHUB_SERVER:-http://localhost:8080}",
                    "token": "${DATAHUB_TOKEN:-}",
                },
            },
            "run_id": "modeldoc-generated",
        }
        pipeline_file = Path(file).parent.absolute() / "pipeline.yml"
        with open(pipeline_file, "w") as f:
            json.dump(pipeline_config, f, indent=2)
            logger.info(f"Wrote pipeline to {pipeline_file}")

    if server:
        logger.info(f"Will send events to {server}")
        assert server.startswith("http://"), "server address must start with http://"
        emitter = DatahubRestEmitter(gms_server=server)
        emitter.test_connection()
        for e in events:
            emitter.emit(e)

    if dot:
        logger.info(f"Will write dot file to {dot}")
        import pydot

        graph = pydot.Dot("my_graph", graph_type="graph")
        for node, adjacency in relationship_graph.map.items():
            my_node = pydot.Node(
                node,
                label=node,
                shape="box",
            )
            graph.add_node(my_node)
            if adjacency.self_loop:
                for relnship in adjacency.self_loop:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
            if adjacency.outgoing:
                for relnship in adjacency.outgoing:
                    graph.add_edge(
                        pydot.Edge(
                            src=relnship.src, dst=relnship.dst, label=relnship.name
                        )
                    )
        Path(dot).parent.mkdir(parents=True, exist_ok=True)
        graph.write_raw(dot)

    if png:
        try:
            graph.write_png(png)
        except Exception as e:
            logger.error("Failed to create png file. Do you have graphviz installed?")
            raise e
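# A hypothetical invocation sketch (not part of the original script): if generate() is
# exposed as a plain function (no CLI decorator is shown above), it could be driven
# directly for a local run. The schema file list and output path are placeholders; the
# pipeline.yml that generate() writes next to the output file can then be ingested with
# the DataHub CLI, e.g. `datahub ingest -c pipeline.yml`.
if __name__ == "__main__":
    generate(
        schema_files=["entity-registry.yml"],
        server=None,
        file="generated/metamodel_mces.json",
        dot=None,
        png=None,
    )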