def _get_urn(self, strip_user_ids_from_email: bool) -> Optional[str]:
    if self.email is not None:
        if strip_user_ids_from_email:
            return builder.make_user_urn(self.email.split("@")[0])
        else:
            return builder.make_user_urn(self.email)
    return None

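# A minimal usage sketch (not part of the snippet above): make_user_urn simply
# wraps a username in a corpuser URN, which is why callers strip the email
# domain first when they want the bare user id. The "jdoe" value is
# illustrative only.
import datahub.emitter.mce_builder as builder

assert builder.make_user_urn("jdoe") == "urn:li:corpuser:jdoe"
assert builder.make_user_urn("jdoe@example.com".split("@")[0]) == "urn:li:corpuser:jdoe"
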
def test_simple_dataset_ownership_transformation(mock_time):
    no_owner_aspect = make_generic_dataset()
    with_owner_aspect = make_dataset_with_owner()

    not_a_dataset = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)",
            aspects=[
                models.DataJobInfoClass(
                    name="User Deletions",
                    description="Constructs the fct_users_deleted from logging_events",
                    type=models.AzkabanJobTypeClass.SQL,
                )
            ],
        )
    )

    inputs = [
        no_owner_aspect,
        with_owner_aspect,
        not_a_dataset,
    ]

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
                builder.make_user_urn("person2"),
            ]
        },
        PipelineContext(run_id="test"),
    )

    outputs = list(
        transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs])
    )
    assert len(outputs) == len(inputs)

    # Check the first entry.
    first_ownership_aspect = builder.get_aspect_if_available(
        outputs[0].record, models.OwnershipClass
    )
    assert first_ownership_aspect
    assert len(first_ownership_aspect.owners) == 2

    # Check the second entry.
    second_ownership_aspect = builder.get_aspect_if_available(
        outputs[1].record, models.OwnershipClass
    )
    assert second_ownership_aspect
    assert len(second_ownership_aspect.owners) == 3

    # Verify that the third entry is unchanged.
    assert inputs[2] == outputs[2].record

def test_simple_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = SimpleAddDatasetOwnership.create(
        {
            "owner_urns": [
                builder.make_user_urn("person1"),
            ],
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(
        transformer.transform(
            [
                RecordEnvelope(input, metadata={}),
                RecordEnvelope(EndOfStream(), metadata={}),
            ]
        )
    )

    assert len(output) == 3

    # The original MCE is unchanged.
    assert input == output[0].record

    ownership_aspect = output[1].record.aspect
    assert isinstance(ownership_aspect, OwnershipClass)
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER

def construct_dashboard(
    self, space_name: str, report_info: dict
) -> DashboardSnapshot:
    report_token = report_info.get("token", "")
    dashboard_urn = builder.make_dashboard_urn(
        self.platform, report_info.get("id", "")
    )
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    # Compute title/description unconditionally so they are defined even when
    # no creator can be resolved (previously they were only set inside the
    # creator branch, which could raise a NameError).
    title = report_info.get("name", "") or ""
    description = report_info.get("description", "") or ""

    last_modified = ChangeAuditStamps()
    creator = self._get_creator(
        report_info.get("_links", {}).get("creator", {}).get("href", "")
    )
    if creator is not None:
        modified_actor = builder.make_user_urn(creator)
        modified_ts = int(
            dp.parse(f"{report_info.get('last_saved_at', 'now')}").timestamp() * 1000
        )
        created_ts = int(
            dp.parse(f"{report_info.get('created_at', 'now')}").timestamp() * 1000
        )
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=created_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

    dashboard_info_class = DashboardInfoClass(
        description=description,
        title=title,
        charts=self._get_chart_urns(report_token),
        lastModified=last_modified,
        dashboardUrl=(
            f"{self.config.connect_uri}/"
            f"{self.config.workspace}/"
            f"reports/{report_token}"
        ),
        customProperties={},
    )
    dashboard_snapshot.aspects.append(dashboard_info_class)

    # Browse path
    browse_path = BrowsePathsClass(
        paths=[
            f"/mode/{self.config.workspace}/"
            f"{space_name}/"
            f"{report_info.get('name')}"
        ]
    )
    dashboard_snapshot.aspects.append(browse_path)

    # Ownership
    ownership = self._get_ownership(
        self._get_creator(
            report_info.get("_links", {}).get("creator", {}).get("href", "")
        )
    )
    if ownership is not None:
        dashboard_snapshot.aspects.append(ownership)

    return dashboard_snapshot

def test_pattern_dataset_ownership_with_type_transformation(mock_time):
    input = make_generic_dataset()

    transformer = PatternAddDatasetOwnership.create(
        {
            "owner_pattern": {
                "rules": {
                    ".*example1.*": [builder.make_user_urn("person1")],
                }
            },
            "ownership_type": "PRODUCER",
        },
        PipelineContext(run_id="test"),
    )

    output = list(transformer.transform([RecordEnvelope(input, metadata={})]))

    assert len(output) == 1
    ownership_aspect = builder.get_aspect_if_available(
        output[0].record, models.OwnershipClass
    )
    assert ownership_aspect
    assert len(ownership_aspect.owners) == 1
    assert ownership_aspect.owners[0].type == models.OwnershipTypeClass.PRODUCER

def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
    workbook_container_key = self.gen_workbook_key(workbook)
    creator = workbook.get("owner", {}).get("username")

    owner_urn = (
        builder.make_user_urn(creator)
        if (creator and self.config.ingest_owner)
        else None
    )

    site_part = f"/site/{self.config.site}" if self.config.site else ""
    workbook_uri = workbook.get("uri", "")
    workbook_part = (
        workbook_uri[workbook_uri.index("/workbooks/") :]
        if workbook.get("uri")
        else None
    )
    workbook_external_url = (
        f"{self.config.connect_uri}/#{site_part}{workbook_part}"
        if workbook_part
        else None
    )

    tag_list = workbook.get("tags", [])
    tag_list_str = (
        [t.get("name", "").upper() for t in tag_list if t is not None]
        if (tag_list and self.config.ingest_tags)
        else None
    )

    container_workunits = gen_containers(
        container_key=workbook_container_key,
        name=workbook.get("name", ""),
        sub_types=["Workbook"],
        description=workbook.get("description"),
        owner_urn=owner_urn,
        external_url=workbook_external_url,
        tags=tag_list_str,
    )

    for wu in container_workunits:
        self.report.report_workunit(wu)
        yield wu

def _aggregate_operation_aspect_events(
    self,
    events: List[RedshiftJoinedAccessEvent],
    operation_type: Union[str, "OperationTypeClass"],
) -> Iterable[MetadataWorkUnit]:
    for event in events:
        if (
            event.database
            and event.usename
            and event.schema_
            and event.table
            and event.endtime
        ):
            resource = f"{event.database}.{event.schema_}.{event.table}"
            last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
            user_email = event.usename

            operation_aspect = OperationClass(
                timestampMillis=last_updated_timestamp,
                lastUpdatedTimestamp=last_updated_timestamp,
                actor=builder.make_user_urn(user_email.split("@")[0]),
                operationType=operation_type,
            )
            mcp = MetadataChangeProposalWrapper(
                entityType="dataset",
                aspectName="operation",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=builder.make_dataset_urn(
                    "redshift", resource.lower(), self.config.env
                ),
                aspect=operation_aspect,
            )
            wu = MetadataWorkUnit(
                id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
                mcp=mcp,
            )
            yield wu

def _get_operation_aspect_work_units(
    self, events: Iterable[SnowflakeJoinedAccessEvent]
) -> Iterable[MetadataWorkUnit]:
    for event in events:
        if event.query_start_time and event.query_type in OPERATION_STATEMENT_TYPES:
            start_time = event.query_start_time
            query_type = event.query_type
            user_email = event.email
            operation_type = OPERATION_STATEMENT_TYPES[query_type]
            last_updated_timestamp: int = int(start_time.timestamp() * 1000)
            user_urn = builder.make_user_urn(user_email.split("@")[0])

            for obj in event.base_objects_accessed:
                resource = obj.objectName
                dataset_urn = builder.make_dataset_urn(
                    "snowflake", resource.lower(), self.config.env
                )
                operation_aspect = OperationClass(
                    timestampMillis=last_updated_timestamp,
                    lastUpdatedTimestamp=last_updated_timestamp,
                    actor=user_urn,
                    operationType=operation_type,
                )
                mcp = MetadataChangeProposalWrapper(
                    entityType="dataset",
                    aspectName="operation",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspect=operation_aspect,
                )
                wu = MetadataWorkUnit(
                    id=f"operation-aspect-{resource}-{start_time.isoformat()}",
                    mcp=mcp,
                )
                yield wu

def _get_ownership(self, creator_id: int) -> Optional[OwnershipClass]:
    user_info_url = f"{self.config.connect_uri}/api/user/{creator_id}"
    try:
        user_info_response = self.session.get(user_info_url)
        user_info_response.raise_for_status()
        user_details = user_info_response.json()
    except HTTPError as http_error:
        self.report.report_failure(
            key=f"metabase-user-{creator_id}",
            reason=f"Unable to retrieve User info. Reason: {str(http_error)}",
        )
        return None

    owner_urn = builder.make_user_urn(user_details.get("email", ""))
    if owner_urn is not None:
        ownership: OwnershipClass = OwnershipClass(
            owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
        )
        return ownership

    return None

def construct_dashboard_from_api_data(
    self, dashboard_info: dict
) -> Optional[DashboardSnapshot]:
    dashboard_id = dashboard_info.get("id", "")
    dashboard_url = f"{self.config.connect_uri}/api/dashboard/{dashboard_id}"
    try:
        dashboard_response = self.session.get(dashboard_url)
        dashboard_response.raise_for_status()
        dashboard_details = dashboard_response.json()
    except HTTPError as http_error:
        self.report.report_failure(
            key=f"metabase-dashboard-{dashboard_id}",
            reason=f"Unable to retrieve dashboard. Reason: {str(http_error)}",
        )
        return None

    dashboard_urn = builder.make_dashboard_urn(
        self.platform, dashboard_details.get("id", "")
    )
    dashboard_snapshot = DashboardSnapshot(
        urn=dashboard_urn,
        aspects=[],
    )

    last_edit_by = dashboard_details.get("last-edit-info") or {}
    modified_actor = builder.make_user_urn(last_edit_by.get("email", "unknown"))
    modified_ts = self.get_timestamp_millis_from_ts_string(
        f"{last_edit_by.get('timestamp')}"
    )
    title = dashboard_details.get("name", "") or ""
    description = dashboard_details.get("description", "") or ""
    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=modified_ts, actor=modified_actor),
        lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
    )

    chart_urns = []
    # Default to an empty list (not the string "{}") so iterating a dashboard
    # with no cards is safe.
    cards_data = dashboard_details.get("ordered_cards", [])
    for card_info in cards_data:
        chart_urn = builder.make_chart_urn(self.platform, card_info.get("id", ""))
        chart_urns.append(chart_urn)

    dashboard_info_class = DashboardInfoClass(
        description=description,
        title=title,
        charts=chart_urns,
        lastModified=last_modified,
        dashboardUrl=f"{self.config.connect_uri}/dashboard/{dashboard_id}",
        customProperties={},
    )
    dashboard_snapshot.aspects.append(dashboard_info_class)

    # Ownership
    ownership = self._get_ownership(dashboard_details.get("creator_id", ""))
    if ownership is not None:
        dashboard_snapshot.aspects.append(ownership)

    return dashboard_snapshot

def generate_ownership_aspect(self):
    ownership = OwnershipClass(
        owners=[
            OwnerClass(
                owner=builder.make_user_urn(owner),
                type=OwnershipTypeClass.DEVELOPER,
                source=OwnershipSourceClass(
                    type=OwnershipSourceTypeClass.SERVICE,
                    # url=dag.filepath,
                ),
            )
            for owner in (self.owners or [])
        ],
        lastModified=AuditStampClass(
            time=0, actor=builder.make_user_urn(self.orchestrator)
        ),
    )
    return [ownership]

class AddDatasetOwnershipConfig(ConfigModel):
    # Workaround for https://github.com/python/mypy/issues/708.
    # Suggested by https://stackoverflow.com/a/64528725/5004662.
    get_owners_to_add: Union[
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
    ]

    default_actor: str = builder.make_user_urn("etl")

class SimpleDatasetOwnershipConfig(DatasetOwnershipBaseConfig):
    owner_urns: List[str]
    default_actor: str = builder.make_user_urn("etl")
    semantics: Semantics = Semantics.OVERWRITE

    @validator("semantics", pre=True)
    def upper_case_semantics(cls, v):
        if isinstance(v, str):
            return v.upper()
        return v

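# A standalone sketch of the same pre-validator pattern used above, so the
# semantics normalization can be exercised in isolation. SketchSemantics and
# SketchConfig are local stand-ins defined only for this sketch, assuming
# pydantic v1-style validators (as used elsewhere in this codebase).
from enum import Enum

from pydantic import BaseModel, validator


class SketchSemantics(str, Enum):
    OVERWRITE = "OVERWRITE"
    PATCH = "PATCH"


class SketchConfig(BaseModel):
    semantics: SketchSemantics = SketchSemantics.OVERWRITE

    @validator("semantics", pre=True)
    def upper_case_semantics(cls, v):
        # Accept "patch" / "Patch" / "PATCH" interchangeably: upper-case any
        # string before pydantic coerces it into the enum.
        if isinstance(v, str):
            return v.upper()
        return v


assert SketchConfig(semantics="patch").semantics == SketchSemantics.PATCH
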
def test_simple_dataset_ownership_with_invalid_type_transformation(mock_time):
    with pytest.raises(ValueError):
        SimpleAddDatasetOwnership.create(
            {
                "owner_urns": [
                    builder.make_user_urn("person1"),
                ],
                "ownership_type": "INVALID_TYPE",
            },
            PipelineContext(run_id="test"),
        )

def _get_ownership(self, user: str) -> Optional[OwnershipClass]:
    if user is not None:
        owner_urn = builder.make_user_urn(user)
        ownership: OwnershipClass = OwnershipClass(
            owners=[
                OwnerClass(
                    owner=owner_urn,
                    type=OwnershipTypeClass.DATAOWNER,
                )
            ]
        )
        return ownership
    return None

def _create_operation_aspect_work_unit(
    self, event: QueryEvent
) -> Optional[MetadataWorkUnit]:
    if event.statementType in OPERATION_STATEMENT_TYPES and event.destinationTable:
        destination_table: BigQueryTableRef
        try:
            destination_table = event.destinationTable.remove_extras()
        except Exception as e:
            self.report.report_warning(
                str(event.destinationTable),
                f"Failed to clean up destination table, {e}",
            )
            return None

        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(event.timestamp.timestamp() * 1000)

        affected_datasets = []
        if event.referencedTables:
            for table in event.referencedTables:
                try:
                    affected_datasets.append(
                        _table_ref_to_urn(
                            table.remove_extras(),
                            self.config.env,
                        )
                    )
                except Exception as e:
                    self.report.report_warning(
                        str(table),
                        f"Failed to clean up table, {e}",
                    )

        operation_aspect = OperationClass(
            timestampMillis=reported_time,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(event.actor_email.split("@")[0]),
            operationType=OPERATION_STATEMENT_TYPES[event.statementType],
            affectedDatasets=affected_datasets,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=_table_ref_to_urn(
                destination_table,
                env=self.config.env,
            ),
            aspect=operation_aspect,
        )
        return MetadataWorkUnit(
            id=f"{event.timestamp.isoformat()}-operation-aspect-{destination_table}",
            mcp=mcp,
        )
    return None

def get_last_modified(
    self, creator: str, created_at: str, updated_at: str
) -> ChangeAuditStamps:
    # The timestamps arrive as ISO strings from the API; dateutil's parser
    # expects str, so the previous `bytes` annotations were incorrect.
    last_modified = ChangeAuditStamps()
    if creator:
        modified_actor = builder.make_user_urn(creator)
        created_ts = int(dp.parse(created_at).timestamp() * 1000)
        modified_ts = int(dp.parse(updated_at).timestamp() * 1000)
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=created_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )
    return last_modified

def test_pattern_dataset_ownership_with_invalid_type_transformation(mock_time):
    with pytest.raises(ValueError):
        PatternAddDatasetOwnership.create(
            {
                "owner_pattern": {
                    "rules": {
                        ".*example1.*": [builder.make_user_urn("person1")],
                    }
                },
                "ownership_type": "INVALID_TYPE",
            },
            PipelineContext(run_id="test"),
        )

def _gen_operation_aspect_workunits_from_access_events(
    self,
    events_iterable: Iterable[RedshiftAccessEvent],
) -> Iterable[MetadataWorkUnit]:
    self.report.num_operational_stats_workunits_emitted = 0
    for event in events_iterable:
        if not (
            event.database
            and event.username
            and event.schema_
            and event.table
            and event.endtime
            and event.operation_type
        ):
            continue

        assert event.operation_type in ["insert", "delete"]

        resource: str = f"{event.database}.{event.schema_}.{event.table}"
        reported_time: int = int(time.time() * 1000)
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email: str = event.username
        operation_aspect = OperationClass(
            timestampMillis=reported_time,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=(
                OperationTypeClass.INSERT
                if event.operation_type == "insert"
                else OperationTypeClass.DELETE
            ),
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn_with_platform_instance(
                "redshift",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            ),
            aspect=operation_aspect,
        )
        wu = MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        self.report.num_operational_stats_workunits_emitted += 1
        yield wu

def make_usage_workunit(
    self,
    bucket_duration: BucketDuration,
    urn_builder: Callable[[ResourceType], str],
    top_n_queries: int,
    format_sql_queries: bool,
) -> MetadataWorkUnit:
    budget_per_query: int = int(self.total_budget_for_query_list / top_n_queries)
    usageStats = DatasetUsageStatisticsClass(
        timestampMillis=int(self.bucket_start_time.timestamp() * 1000),
        eventGranularity=TimeWindowSizeClass(unit=bucket_duration, multiple=1),
        uniqueUserCount=len(self.userFreq),
        totalSqlQueries=self.queryCount,
        topSqlQueries=[
            self.trim_query(
                format_sql_query(query, keyword_case="upper", reindent_aligned=True)
                if format_sql_queries
                else query,
                budget_per_query,
            )
            for query, _ in self.queryFreq.most_common(top_n_queries)
        ],
        userCounts=[
            DatasetUserUsageCountsClass(
                user=builder.make_user_urn(user_email.split("@")[0]),
                count=count,
                userEmail=user_email,
            )
            for user_email, count in self.userFreq.most_common()
        ],
        fieldCounts=[
            DatasetFieldUsageCountsClass(
                fieldPath=column,
                count=count,
            )
            for column, count in self.columnFreq.most_common()
        ],
    )
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        aspectName="datasetUsageStatistics",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=urn_builder(self.resource),
        aspect=usageStats,
    )
    return MetadataWorkUnit(
        id=f"{self.bucket_start_time.isoformat()}-{self.resource}", mcp=mcp
    )

def get_owners(owners: Owners) -> models.OwnershipClass:
    owners_meta: List[models.OwnerClass] = []

    if owners.users is not None:
        owners_meta = owners_meta + [
            models.OwnerClass(
                owner=make_user_urn(o),
                type=models.OwnershipTypeClass.DEVELOPER,
            )
            for o in owners.users
        ]

    if owners.groups is not None:
        owners_meta = owners_meta + [
            models.OwnerClass(
                owner=make_group_urn(o),
                type=models.OwnershipTypeClass.DEVELOPER,
            )
            for o in owners.groups
        ]

    return models.OwnershipClass(owners=owners_meta)

class AddDatasetOwnershipConfig(ConfigModel):
    # Workaround for https://github.com/python/mypy/issues/708.
    # Suggested by https://stackoverflow.com/a/64528725/5004662.
    get_owners_to_add: Union[
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
        Callable[[DatasetSnapshotClass], List[OwnerClass]],
    ]
    default_actor: str = builder.make_user_urn("etl")
    semantics: Semantics = Semantics.OVERWRITE

    _resolve_owner_fn = pydantic_resolve_key("get_owners_to_add")

    @validator("semantics", pre=True)
    def ensure_semantics_is_upper_case(cls, v):
        if isinstance(v, str):
            return v.upper()
        return v

def make_dataset_with_owner() -> models.MetadataChangeEventClass:
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
            aspects=[
                models.OwnershipClass(
                    owners=[
                        models.OwnerClass(
                            owner=builder.make_user_urn("fake_owner"),
                            type=models.OwnershipTypeClass.DATAOWNER,
                        ),
                    ],
                    lastModified=models.AuditStampClass(
                        time=1625266033123, actor="urn:li:corpuser:datahub"
                    ),
                )
            ],
        ),
    )

def to_datahub_user(
    self, user: PowerBiAPI.User
) -> List[MetadataChangeProposalWrapper]:
    """
    Map a PowerBI user to a DataHub user.
    """
    LOGGER.info(
        "Converting user {}(id={}) to datahub's user".format(
            user.displayName, user.id
        )
    )

    # Create a URN for the user
    user_urn = builder.make_user_urn(user.get_urn_part())

    user_info_instance = CorpUserInfoClass(
        displayName=user.displayName,
        email=user.emailAddress,
        title=user.displayName,
        active=True,
    )
    info_mcp = self.new_mcp(
        entity_type=Constant.CORP_USER,
        entity_urn=user_urn,
        aspect_name=Constant.CORP_USER_INFO,
        aspect=user_info_instance,
    )

    # Status mcp: removed=False marks the user as not soft-deleted
    status_mcp = self.new_mcp(
        entity_type=Constant.CORP_USER,
        entity_urn=user_urn,
        aspect_name=Constant.STATUS,
        aspect=StatusClass(removed=False),
    )

    user_key = CorpUserKeyClass(username=user.id)
    user_key_mcp = self.new_mcp(
        entity_type=Constant.CORP_USER,
        entity_urn=user_urn,
        aspect_name=Constant.CORP_USER_KEY,
        aspect=user_key,
    )

    return [info_mcp, status_mcp, user_key_mcp]

def _gen_operation_aspect_workunits_by_type_from_access_events(
    self,
    events_iterable: Iterable[RedshiftAccessEvent],
    operation_type: Union[str, "OperationTypeClass"],
) -> Iterable[MetadataWorkUnit]:
    for event in events_iterable:
        if not (
            event.database
            and event.username
            and event.schema_
            and event.table
            and event.endtime
        ):
            continue

        resource: str = f"{event.database}.{event.schema_}.{event.table}"
        last_updated_timestamp: int = int(event.endtime.timestamp() * 1000)
        user_email: str = event.username
        operation_aspect = OperationClass(
            timestampMillis=last_updated_timestamp,
            lastUpdatedTimestamp=last_updated_timestamp,
            actor=builder.make_user_urn(user_email.split("@")[0]),
            operationType=operation_type,
        )
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            aspectName="operation",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=builder.make_dataset_urn_with_platform_instance(
                "redshift",
                resource.lower(),
                self.config.platform_instance,
                self.config.env,
            ),
            aspect=operation_aspect,
        )
        wu = MetadataWorkUnit(
            id=f"operation-aspect-{event.table}-{event.endtime.isoformat()}",
            mcp=mcp,
        )
        self.report.report_workunit(wu)
        yield wu

def construct_card_from_api_data(self, card_data: dict) -> Optional[ChartSnapshot]:
    card_id = card_data.get("id", "")
    card_url = f"{self.config.connect_uri}/api/card/{card_id}"
    try:
        card_response = self.session.get(card_url)
        card_response.raise_for_status()
        card_details = card_response.json()
    except HTTPError as http_error:
        self.report.report_failure(
            key=f"metabase-card-{card_id}",
            reason=f"Unable to retrieve Card info. Reason: {str(http_error)}",
        )
        return None

    chart_urn = builder.make_chart_urn(self.platform, card_id)
    chart_snapshot = ChartSnapshot(
        urn=chart_urn,
        aspects=[],
    )

    last_edit_by = card_details.get("last-edit-info") or {}
    modified_actor = builder.make_user_urn(last_edit_by.get("email", "unknown"))
    modified_ts = self.get_timestamp_millis_from_ts_string(
        f"{last_edit_by.get('timestamp')}"
    )
    last_modified = ChangeAuditStamps(
        created=AuditStamp(time=modified_ts, actor=modified_actor),
        lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
    )

    chart_type = self._get_chart_type(
        card_details.get("id", ""), card_details.get("display")
    )
    description = card_details.get("description") or ""
    title = card_details.get("name") or ""
    datasource_urn = self.get_datasource_urn(card_details)
    custom_properties = self.construct_card_custom_properties(card_details)

    chart_info = ChartInfoClass(
        type=chart_type,
        description=description,
        title=title,
        lastModified=last_modified,
        chartUrl=f"{self.config.connect_uri}/card/{card_id}",
        inputs=datasource_urn,
        customProperties=custom_properties,
    )
    chart_snapshot.aspects.append(chart_info)

    if card_details.get("query_type", "") == "native":
        raw_query = (
            card_details.get("dataset_query", {}).get("native", {}).get("query", "")
        )
        chart_query_native = ChartQueryClass(
            rawQuery=raw_query,
            type=ChartQueryTypeClass.SQL,
        )
        chart_snapshot.aspects.append(chart_query_native)

    # Ownership
    ownership = self._get_ownership(card_details.get("creator_id", ""))
    if ownership is not None:
        chart_snapshot.aspects.append(ownership)

    return chart_snapshot

def send_lineage(
    operator: "BaseOperator",
    inlets: Optional[List] = None,
    outlets: Optional[List] = None,
    context: Optional[Dict] = None,
) -> None:
    context = context or {}  # ensure not None to satisfy mypy

    dag: "DAG" = context["dag"]
    task = context["task"]
    # task_instance: "TaskInstance" = context["task_instance"]

    # TODO: verify if task and operator are the same?
    # TODO: use dag serialization to just save the whole thing.
    # TODO: save context.get("conf")
    # TODO: save DAG tags
    # TODO: save context.get("dag_run")
    # TODO: save all the data from task_instance
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    timestamp = int(dateutil.parser.parse(context["ts"]).timestamp() * 1000)
    ownership = models.OwnershipClass(
        owners=[
            models.OwnerClass(
                owner=dag.owner,
                type=models.OwnershipTypeClass.DEVELOPER,
                source=models.OwnershipSourceClass(
                    type=models.OwnershipSourceTypeClass.SERVICE,
                    url=dag.filepath,
                ),
            )
        ],
        lastModified=models.AuditStampClass(
            time=timestamp, actor=builder.make_user_urn("airflow")
        ),
    )

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md}",
                ),
                ownership,
            ],
        )
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=None,  # TODO: add datajob description
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                ),
                ownership,
            ],
        )
    )

    lineage_mces = [
        builder.make_lineage_mce(_entities_to_urn_list(inlets or []), outlet)
        for outlet in _entities_to_urn_list(outlets or [])
    ]

    hook = make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *lineage_mces,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)

import logging
from typing import Optional

from datahub.emitter.mce_builder import make_dataset_urn, make_user_urn

# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph

# Imports for metadata model classes
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    OwnerClass,
    OwnershipClass,
    OwnershipTypeClass,
)

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Inputs -> owner, ownership_type, dataset
owner_to_add = make_user_urn("jdoe")
ownership_type = OwnershipTypeClass.DATAOWNER
dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD")

# Some objects to help with conditional pathways later
owner_class_to_add = OwnerClass(owner=owner_to_add, type=ownership_type)
ownership_to_add = OwnershipClass(owners=[owner_class_to_add])

# First we get the current owners
gms_endpoint = "http://localhost:8080"
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))

current_owners: Optional[OwnershipClass] = graph.get_aspect_v2(
    entity_urn=dataset_urn,
    aspect="ownership",
    aspect_type=OwnershipClass,
)

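# A hedged sketch of how this read-modify-write might continue: append the
# owner only if it is not already present, then emit the merged aspect.
# MetadataChangeProposalWrapper is assumed to come from datahub.emitter.mcp;
# treat this as illustrative rather than canonical.
from datahub.emitter.mcp import MetadataChangeProposalWrapper

need_write = False
if current_owners:
    if (owner_to_add, ownership_type) not in [
        (owner.owner, owner.type) for owner in current_owners.owners
    ]:
        # owners exist, but this owner is not among them
        current_owners.owners.append(owner_class_to_add)
        need_write = True
else:
    # no ownership aspect exists yet, so start from the one built above
    current_owners = ownership_to_add
    need_write = True

if need_write:
    event = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="ownership",
        aspect=current_owners,
    )
    graph.emit(event)
    log.info(f"Owner {owner_to_add} added to dataset {dataset_urn}")
else:
    log.info(f"Owner {owner_to_add} already exists for {dataset_urn}, omitting write")
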
def send_lineage_to_datahub(
    config: DatahubBasicLineageConfig,
    operator: "BaseOperator",
    inlets: List[_Entity],
    outlets: List[_Entity],
    context: Dict,
) -> None:
    # This is necessary to avoid issues with circular imports.
    from airflow.serialization.serialized_objects import (
        SerializedBaseOperator,
        SerializedDAG,
    )

    dag: "DAG" = context["dag"]
    task: "BaseOperator" = context["task"]

    # resolve URNs for upstream nodes in subdags upstream of the current task.
    upstream_subdag_task_urns: List[str] = []

    for upstream_task_id in task.upstream_task_ids:
        upstream_task = dag.task_dict[upstream_task_id]

        # if upstream task is not a subdag, then skip it
        if upstream_task.subdag is None:
            continue

        # else, link the leaf tasks of the upstream subdag as upstream tasks
        upstream_subdag = upstream_task.subdag
        upstream_subdag_flow_urn = builder.make_data_flow_urn(
            "airflow", upstream_subdag.dag_id, config.cluster
        )

        for upstream_subdag_task_id in upstream_subdag.task_dict:
            upstream_subdag_task = upstream_subdag.task_dict[upstream_subdag_task_id]
            upstream_subdag_task_urn = builder.make_data_job_urn_with_flow(
                upstream_subdag_flow_urn, upstream_subdag_task_id
            )

            # if subdag task is a leaf task, then link it as an upstream task
            if len(upstream_subdag_task._downstream_task_ids) == 0:
                upstream_subdag_task_urns.append(upstream_subdag_task_urn)

    # resolve URNs for upstream nodes that trigger the subdag containing the
    # current task (if it is in a subdag at all).
    upstream_subdag_triggers: List[str] = []

    # subdags are always named with 'parent.child' style or Airflow won't run them
    # add connection from subdag trigger(s) if subdag task has no upstreams
    if (
        dag.is_subdag
        and dag.parent_dag is not None
        and len(task._upstream_task_ids) == 0
    ):
        # filter through the parent dag's tasks and find the subdag trigger(s)
        subdags = [
            x for x in dag.parent_dag.task_dict.values() if x.subdag is not None
        ]
        matched_subdags = [
            x
            for x in subdags
            if getattr(getattr(x, "subdag"), "dag_id") == dag.dag_id
        ]

        # id of the task containing the subdag
        subdag_task_id = matched_subdags[0].task_id

        parent_dag_urn = builder.make_data_flow_urn(
            "airflow", dag.parent_dag.dag_id, config.cluster
        )

        # iterate through the parent dag's tasks and find the ones that trigger the subdag
        for upstream_task_id in dag.parent_dag.task_dict:
            upstream_task = dag.parent_dag.task_dict[upstream_task_id]
            upstream_task_urn = builder.make_data_job_urn_with_flow(
                parent_dag_urn, upstream_task_id
            )

            # if the task triggers the subdag, link it to this node in the subdag
            if subdag_task_id in upstream_task._downstream_task_ids:
                upstream_subdag_triggers.append(upstream_task_urn)

    # TODO: capture context
    # context dag_run
    # task_instance: "TaskInstance" = context["task_instance"]
    # TODO: capture raw sql from db operators

    flow_urn = builder.make_data_flow_urn("airflow", dag.dag_id, config.cluster)
    job_urn = builder.make_data_job_urn_with_flow(flow_urn, task.task_id)

    base_url = conf.get("webserver", "base_url")
    flow_url = f"{base_url}/tree?dag_id={dag.dag_id}"
    job_url = f"{base_url}/taskinstance/list/?flt1_dag_id_equals={dag.dag_id}&_flt_3_task_id={task.task_id}"
    # operator.log.info(f"{flow_url=}")
    # operator.log.info(f"{job_url=}")

    # operator.log.info(f"{dag.get_serialized_fields()=}")
    # operator.log.info(f"{task.get_serialized_fields()=}")
    # operator.log.info(f"{SerializedDAG.serialize_dag(dag)=}")

    flow_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedDAG.serialize_dag(dag).items()
    }
    for key in dag.get_serialized_fields():
        if key not in flow_property_bag:
            flow_property_bag[key] = repr(getattr(dag, key))

    job_property_bag: Dict[str, str] = {
        key: repr(value)
        for (key, value) in SerializedBaseOperator.serialize_operator(task).items()
    }
    for key in task.get_serialized_fields():
        if key not in job_property_bag:
            job_property_bag[key] = repr(getattr(task, key))
    # operator.log.info(f"{flow_property_bag=}")
    # operator.log.info(f"{job_property_bag=}")

    allowed_task_keys = [
        "_downstream_task_ids",
        "_inlets",
        "_outlets",
        "_task_type",
        "_task_module",
        "depends_on_past",
        "email",
        "label",
        "execution_timeout",
        "end_date",
        "start_date",
        "sla",
        "sql",
        "task_id",
        "trigger_rule",
        "wait_for_downstream",
    ]
    job_property_bag = {
        k: v for (k, v) in job_property_bag.items() if k in allowed_task_keys
    }

    allowed_flow_keys = [
        "_access_control",
        "_concurrency",
        "_default_view",
        "catchup",
        "fileloc",
        "is_paused_upon_creation",
        "start_date",
        "tags",
        "timezone",
    ]
    flow_property_bag = {
        k: v for (k, v) in flow_property_bag.items() if k in allowed_flow_keys
    }

    if config.capture_ownership_info:
        ownership = models.OwnershipClass(
            owners=[
                models.OwnerClass(
                    owner=builder.make_user_urn(dag.owner),
                    type=models.OwnershipTypeClass.DEVELOPER,
                    source=models.OwnershipSourceClass(
                        type=models.OwnershipSourceTypeClass.SERVICE,
                        url=dag.filepath,
                    ),
                )
            ],
            lastModified=models.AuditStampClass(
                time=0, actor=builder.make_user_urn("airflow")
            ),
        )
        # operator.log.info(f"{ownership=}")
        ownership_aspect = [ownership]
    else:
        ownership_aspect = []

    if config.capture_tags_info:
        tags = models.GlobalTagsClass(
            tags=[
                models.TagAssociationClass(tag=builder.make_tag_urn(tag))
                for tag in (dag.tags or [])
            ]
        )
        # operator.log.info(f"{tags=}")
        tags_aspect = [tags]
    else:
        tags_aspect = []

    flow_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataFlowSnapshotClass(
            urn=flow_urn,
            aspects=[
                models.DataFlowInfoClass(
                    name=dag.dag_id,
                    description=f"{dag.description}\n\n{dag.doc_md or ''}",
                    customProperties=flow_property_bag,
                    externalUrl=flow_url,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    # exclude subdag operator tasks since these are not emitted, resulting in empty metadata
    upstream_tasks = (
        [
            builder.make_data_job_urn_with_flow(flow_urn, task_id)
            for task_id in task.upstream_task_ids
            if dag.task_dict[task_id].subdag is None
        ]
        + upstream_subdag_task_urns
        + upstream_subdag_triggers
    )

    job_doc = (
        (
            operator.doc
            or operator.doc_md
            or operator.doc_json
            or operator.doc_yaml
            or operator.doc_rst
        )
        if not AIRFLOW_1
        else None
    )

    job_mce = models.MetadataChangeEventClass(
        proposedSnapshot=models.DataJobSnapshotClass(
            urn=job_urn,
            aspects=[
                models.DataJobInfoClass(
                    name=task.task_id,
                    type=models.AzkabanJobTypeClass.COMMAND,
                    description=job_doc,
                    customProperties=job_property_bag,
                    externalUrl=job_url,
                ),
                models.DataJobInputOutputClass(
                    inputDatasets=_entities_to_urn_list(inlets or []),
                    outputDatasets=_entities_to_urn_list(outlets or []),
                    inputDatajobs=upstream_tasks,
                ),
                *ownership_aspect,
                *tags_aspect,
            ],
        )
    )

    force_entity_materialization = [
        models.MetadataChangeEventClass(
            proposedSnapshot=models.DatasetSnapshotClass(
                urn=iolet,
                aspects=[
                    models.StatusClass(removed=False),
                ],
            )
        )
        for iolet in _entities_to_urn_list((inlets or []) + (outlets or []))
    ]

    hook = config.make_emitter_hook()

    mces = [
        flow_mce,
        job_mce,
        *force_entity_materialization,
    ]
    operator.log.info(
        "DataHub lineage backend - emitting metadata:\n"
        + "\n".join(json.dumps(mce.to_obj()) for mce in mces)
    )
    hook.emit_mces(mces)

def construct_chart_from_api_data(
    self, chart_data: dict, query: dict, path: str
) -> ChartSnapshot:
    chart_urn = builder.make_chart_urn(self.platform, chart_data.get("token", ""))
    chart_snapshot = ChartSnapshot(
        urn=chart_urn,
        aspects=[],
    )

    last_modified = ChangeAuditStamps()
    creator = self._get_creator(
        chart_data.get("_links", {}).get("creator", {}).get("href", "")
    )
    if creator is not None:
        modified_actor = builder.make_user_urn(creator)
        created_ts = int(
            dp.parse(chart_data.get("created_at", "now")).timestamp() * 1000
        )
        modified_ts = int(
            dp.parse(chart_data.get("updated_at", "now")).timestamp() * 1000
        )
        last_modified = ChangeAuditStamps(
            created=AuditStamp(time=created_ts, actor=modified_actor),
            lastModified=AuditStamp(time=modified_ts, actor=modified_actor),
        )

    chart_detail = (
        chart_data.get("view", {})
        if len(chart_data.get("view", {})) != 0
        else chart_data.get("view_vegas", {})
    )

    mode_chart_type = chart_detail.get("chartType", "") or chart_detail.get(
        "selectedChart", ""
    )
    chart_type = self._get_chart_type(chart_data.get("token", ""), mode_chart_type)

    description = (
        chart_detail.get("description")
        or chart_detail.get("chartDescription")
        or ""
    )
    title = chart_detail.get("title") or chart_detail.get("chartTitle") or ""

    # create datasource urn
    platform, db_name = self._get_platform_and_dbname(query.get("data_source_id"))
    source_tables = self._get_source_from_query(query.get("raw_query"))
    datasource_urn = self._get_datasource_urn(platform, db_name, source_tables)

    custom_properties = self.construct_chart_custom_properties(
        chart_detail, mode_chart_type
    )

    # Chart Info
    chart_info = ChartInfoClass(
        type=chart_type,
        description=description,
        title=title,
        lastModified=last_modified,
        chartUrl=f"{self.config.connect_uri}"
        f"{chart_data.get('_links', {}).get('report_viz_web', {}).get('href', '')}",
        inputs=datasource_urn,
        customProperties=custom_properties,
    )
    chart_snapshot.aspects.append(chart_info)

    # Browse Path
    browse_path = BrowsePathsClass(paths=[path])
    chart_snapshot.aspects.append(browse_path)

    # Query
    chart_query = ChartQueryClass(
        rawQuery=query.get("raw_query", ""),
        type=ChartQueryTypeClass.SQL,
    )
    chart_snapshot.aspects.append(chart_query)

    # Ownership
    ownership = self._get_ownership(
        self._get_creator(
            chart_data.get("_links", {}).get("creator", {}).get("href", "")
        )
    )
    if ownership is not None:
        chart_snapshot.aspects.append(ownership)

    return chart_snapshot