def test_pattern_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [no_owner_aspect, with_owner_aspect, not_a_dataset, EndOfStream()]

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                    ".*example2.*": [builder.make_user_urn("person2")],
                }
            },
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs) + 1  # additional MCP due to the no-owner MCE

    # Check the first entry.
    assert inputs[0] == outputs[0].record
    first_ownership_aspect = outputs[3].record.aspect
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 1
    assert all(
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in first_ownership_aspect.owners
    )

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 2
    assert all(
        owner.type == models.OwnershipTypeClass.DATAOWNER
        for owner in second_ownership_aspect.owners
    )

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record

    # Verify that the last entry is unchanged (EOS).
    assert inputs[-1] == outputs[-1].record
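# NOTE: `make_generic_dataset` is referenced above but not defined in this
# fragment. A minimal sketch of what such a helper could look like, assuming it
# only needs to produce a dataset MCE with no ownership aspect (the helper in
# the actual test suite may differ):
def make_generic_dataset() -> models.MetadataChangeEventClass:
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=[
                models.StatusClass(removed=False),
            ],
        ),
    )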
import json

import pytest
import requests

import datahub.metadata.schema_classes as models
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter

MOCK_GMS_ENDPOINT = "http://fakegmshost:8080"

basicAuditStamp = models.AuditStampClass(
    time=1618987484580,
    actor="urn:li:corpuser:datahub",
    impersonator=None,
)


@pytest.mark.parametrize(
    "record,path,snapshot",
    [
        (
            # Simple test.
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,downstream,PROD)",
                    aspects=[
                        models.UpstreamLineageClass(
                            upstreams=[
                                models.UpstreamClass(
                                    auditStamp=basicAuditStamp,
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))

    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }

    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=timestamp, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=[
                        builder.make_data_job_urn_with_flow(flow_urn, task_id)
                        for task_id in task.upstream_task_ids
                    ],
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]

    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)
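# NOTE: `_entities_to_urn_list` is used above but not shown in this fragment.
# A plausible sketch, assuming each `_Entity` exposes a `urn` property that
# yields a DataHub dataset URN (the real helper may differ):
def _entities_to_urn_list(iolets: List[_Entity]) -> List[str]:
    return [let.urn for let in iolets]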
import logging
from typing import List, Optional

from pydantic import validator

import datahub.metadata.schema_classes as models
from datahub.configuration.common import ConfigModel
from datahub.configuration.config_loader import load_config_file
from datahub.emitter.mce_builder import get_sys_time, make_group_urn, make_user_urn
from datahub.ingestion.api.source import Source, SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit

logger = logging.getLogger(__name__)

valid_status: models.StatusClass = models.StatusClass(removed=False)
auditStamp = models.AuditStampClass(
    time=get_sys_time(), actor="urn:li:corpUser:restEmitter"
)


class Owners(ConfigModel):
    users: Optional[List[str]]
    groups: Optional[List[str]]


class GlossaryTermConfig(ConfigModel):
    name: str
    description: str
    term_source: Optional[str]
    source_ref: Optional[str]
    source_url: Optional[str]
    owners: Optional[Owners]
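# NOTE: a minimal sketch of how these config models could be populated and how
# the imported urn builders would apply to their owners; the field values below
# are illustrative, not taken from a real glossary file.
def _example_term_owner_urns() -> List[str]:
    term = GlossaryTermConfig(
        name="PII",
        description="Personally identifiable information",
        owners=Owners(users=["jdoe"], groups=["governance"]),
    )
    owner_urns: List[str] = []
    if term.owners and term.owners.users:
        owner_urns += [make_user_urn(user) for user in term.owners.users]
    if term.owners and term.owners.groups:
        owner_urns += [make_group_urn(group) for group in term.owners.groups]
    return owner_urns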
def test_simple_dataset_ownership_transformation(mock_time):
    no_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=[
                models.StatusClass(removed=False),
            ],
        ),
    )

    with_owner_aspect = models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=builder.get_sys_time(), actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )

    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 2

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record
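# NOTE: for context, a sketch of how a transformer like the one under test is
# typically wired into an ingestion recipe. The registry name
# "simple_add_dataset_ownership", the file source/sink, and the filenames are
# assumptions for illustration, not part of this test.
EXAMPLE_RECIPE = {
    "source": {"type": "file", "config": {"filename": "./mces.json"}},
    "transformers": [
        {
            "type": "simple_add_dataset_ownership",
            "config": {
                "owner_urns": [
                    "urn:li:corpuser:person1",
                    "urn:li:corpuser:person2",
                ]
            },
        }
    ],
    "sink": {"type": "file", "config": {"filename": "./mces_with_owners.json"}},
}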