def _create_lineage_from_csql_datasource(
    self, csql_urn: str, csql_datasource: List[dict]
) -> Iterable[MetadataWorkUnit]:
    """Yield upstreamLineage MCP work units linking each datasource in
    *csql_datasource* to the custom-SQL dataset identified by *csql_urn*."""
    for ds in csql_datasource:
        # Downstream entity: the datasource itself.
        ds_urn = builder.make_dataset_urn(
            self.platform, ds.get("id", ""), self.config.env
        )
        # The custom-SQL table is the single upstream of this datasource.
        lineage = UpstreamLineage(
            upstreams=[
                UpstreamClass(
                    dataset=csql_urn,
                    type=DatasetLineageTypeClass.TRANSFORMED,
                )
            ]
        )
        yield self.get_metadata_change_proposal(
            ds_urn, aspect_name="upstreamLineage", aspect=lineage
        )
def get_upstream_lineage(upstream_urns: List[str]) -> UpstreamLineage:
    """Build an UpstreamLineage aspect from a list of upstream dataset urns.

    Each upstream is marked TRANSFORMED and audit-stamped with the dbt
    executor actor at the current time in epoch milliseconds.

    Fix: the original computed ``int(time.time()) * 1000``, which truncates
    to whole seconds before scaling, so the millisecond component was always
    000. ``int(time.time() * 1000)`` keeps sub-second precision.
    """
    actor = "urn:li:corpuser:dbt_executor"
    sys_time = int(time.time() * 1000)  # epoch millis, full precision
    ucl = [
        UpstreamClass(
            dataset=dep,
            auditStamp=AuditStamp(actor=actor, time=sys_time),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for dep in upstream_urns
    ]
    return UpstreamLineage(upstreams=ucl)
def _get_upsteam_lineage(
    self, looker_view: LookerView, actor: str, sys_time: int
) -> UpstreamLineage:
    """Build the UpstreamLineage aspect for *looker_view*: one TRANSFORMED,
    audit-stamped upstream per sql table name referenced by the view."""
    # (Method name keeps the historical "upsteam" spelling for callers.)
    upstream_classes = [
        UpstreamClass(
            dataset=self._construct_datalineage_urn(
                table_name, looker_view.connection
            ),
            auditStamp=AuditStamp(actor=actor, time=sys_time),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for table_name in looker_view.sql_table_names
    ]
    return UpstreamLineage(upstreams=upstream_classes)
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    """Return (upstreamLineage MCP, dataset properties) for *dataset_urn*.

    Returns (None, None) for an unparseable urn, and (None, properties)
    when no upstreams are known for the dataset.
    """
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None

    # Build the lineage map lazily on first use.
    if not self._lineage_map:
        self._populate_lineage()
    assert self._lineage_map is not None

    upstreams: List[UpstreamClass] = []
    # NOTE(review): nothing in this method populates custom_properties, so
    # the returned DatasetPropertiesClass is currently always None.
    custom_properties: Dict[str, str] = {}
    entry = self._lineage_map.get(dataset_key.name)
    if entry is not None:
        for up in entry.upstreams:
            upstreams.append(
                UpstreamClass(
                    dataset=builder.make_dataset_urn_with_platform_instance(
                        up.platform.value,
                        up.path,
                        self.config.platform_instance,
                        self.config.env,
                    ),
                    type=entry.dataset_lineage_type,
                )
            )

    properties = (
        DatasetPropertiesClass(customProperties=custom_properties)
        if custom_properties
        else None
    )
    if not upstreams:
        return None, properties

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstreams),
    )
    return mcp, properties
def _get_upstream_lineage(self, looker_view: LookerView) -> UpstreamLineage:
    """Build the UpstreamLineage aspect for *looker_view*, one TRANSFORMED
    upstream per referenced sql table (quote characters stripped first)."""
    upstream_list = [
        UpstreamClass(
            dataset=self._construct_datalineage_urn(
                # Strip SQL quoting before urn construction.
                raw_name.replace('"', "").replace("`", ""),
                looker_view.connection,
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        for raw_name in looker_view.sql_table_names
    ]
    return UpstreamLineage(upstreams=upstream_list)
def _get_upstream_lineage(
    self, looker_view: LookerView
) -> Optional[UpstreamLineage]:
    """Build the UpstreamLineage aspect (type VIEW) for *looker_view*.

    Returns None when the view references no sql tables.
    """
    upstream_list = []
    for raw_name in looker_view.sql_table_names:
        # Strip SQL quoting characters before urn construction.
        cleaned = raw_name.replace('"', "").replace("`", "")
        upstream_list.append(
            UpstreamClass(
                dataset=self._construct_datalineage_urn(cleaned, looker_view),
                type=DatasetLineageTypeClass.VIEW,
            )
        )
    if not upstream_list:
        return None
    return UpstreamLineage(upstreams=upstream_list)
def _to_metadata_events(  # noqa: C901
    self, config: LookerCommonConfig, reporter: SourceReport, base_url: str
) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]:
    """Build the metadata events for this explore: one MCE carrying the
    dataset snapshot (browse path, status, properties, lineage, schema)
    plus one MCP setting the "explore" subtype.

    :param config: common Looker config used to mint urns/browse paths.
    :param reporter: source report passed through to schema extraction.
    :param base_url: Looker base URL used for the externalUrl property.
    """
    # We only generate MCE-s for explores that contain from clauses and do NOT contain joins
    # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph.
    dataset_snapshot = DatasetSnapshot(
        urn=self.get_explore_urn(config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(StatusClass(removed=False))

    # Surface label/source-file (when present) as custom properties.
    custom_properties = {}
    if self.label is not None:
        custom_properties["looker.explore.label"] = str(self.label)
    if self.source_file is not None:
        custom_properties["looker.explore.file"] = str(self.source_file)
    dataset_props = DatasetPropertiesClass(
        description=self.description,
        customProperties=custom_properties,
    )
    dataset_props.externalUrl = self._get_url(base_url)
    dataset_snapshot.aspects.append(dataset_props)

    # Lineage: one VIEW-type upstream per view backing this explore,
    # sorted for deterministic output.
    if self.upstream_views is not None:
        assert self.project_name is not None
        upstreams = [
            UpstreamClass(
                dataset=LookerViewId(
                    project_name=self.project_name,
                    model_name=self.model_name,
                    view_name=view_name,
                ).get_urn(config),
                type=DatasetLineageTypeClass.VIEW,
            )
            for view_name in sorted(self.upstream_views)
        ]
        upstream_lineage = UpstreamLineage(upstreams=upstreams)
        dataset_snapshot.aspects.append(upstream_lineage)

    # Schema metadata is optional: only attached when field extraction succeeds.
    if self.fields is not None:
        schema_metadata = LookerUtil._get_schema(
            platform_name=config.platform_name,
            schema_name=self.name,
            view_fields=self.fields,
            reporter=reporter,
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    # Subtype is emitted as a separate MCP (not an MCE aspect).
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_snapshot.urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["explore"]),
    )
    return [mce, mcp]
def _get_upstream_lineage_info(
    self, dataset_urn: str
) -> Optional[Tuple[UpstreamLineage, Dict[str, str]]]:
    """Return (UpstreamLineage, column-lineage property map) for a dataset.

    Returns None when the urn is invalid or when neither table lineage nor
    external (S3) lineage is known for the dataset.
    """
    dataset_key = builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        logger.warning(f"Invalid dataset urn {dataset_urn}. Could not get key!")
        return None

    # Lazily populate both lineage maps on first use.
    if self._lineage_map is None:
        self._populate_lineage()
        self._populate_view_lineage()
    if self._external_lineage_map is None:
        self._populate_external_lineage()
    assert self._lineage_map is not None
    assert self._external_lineage_map is not None

    dataset_name = dataset_key.name
    # NOTE(review): direct indexing implies these maps tolerate missing keys
    # (e.g. defaultdict) — confirm in _populate_lineage.
    lineage = self._lineage_map[dataset_name]
    external_lineage = self._external_lineage_map[dataset_name]
    if not (lineage or external_lineage):
        logger.debug(f"No lineage found for {dataset_name}")
        return None

    upstream_tables: List[UpstreamClass] = []
    column_lineage: Dict[str, str] = {}
    # Each lineage_entry is (upstream table name, upstream columns JSON,
    # downstream columns JSON) — see the json.loads calls below.
    for lineage_entry in lineage:
        # Update the table-lineage
        upstream_table_name = lineage_entry[0]
        if not self._is_dataset_allowed(upstream_table_name):
            continue
        upstream_table = UpstreamClass(
            dataset=builder.make_dataset_urn_with_platform_instance(
                self.platform,
                upstream_table_name,
                self.config.platform_instance,
                self.config.env,
            ),
            type=DatasetLineageTypeClass.TRANSFORMED,
        )
        upstream_tables.append(upstream_table)
        # Update column-lineage for each down-stream column.
        upstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[1])
        ]
        downstream_columns = [
            d["columnName"].lower() for d in json.loads(lineage_entry[2])
        ]
        # Render a human-readable "{upstream(cols) -> downstream(cols)}"
        # string, keyed per upstream table, for the dataset's properties.
        upstream_column_str = (
            f"{upstream_table_name}({', '.join(sorted(upstream_columns))})"
        )
        downstream_column_str = (
            f"{dataset_name}({', '.join(sorted(downstream_columns))})"
        )
        column_lineage_key = f"column_lineage[{upstream_table_name}]"
        column_lineage_value = (
            f"{{{upstream_column_str} -> {downstream_column_str}}}"
        )
        column_lineage[column_lineage_key] = column_lineage_value
        logger.debug(f"{column_lineage_key}:{column_lineage_value}")

    for external_lineage_entry in external_lineage:
        # For now, populate only for S3
        if external_lineage_entry.startswith("s3://"):
            external_upstream_table = UpstreamClass(
                dataset=make_s3_urn(external_lineage_entry, self.config.env),
                type=DatasetLineageTypeClass.COPY,
            )
            upstream_tables.append(external_upstream_table)

    if upstream_tables:
        logger.debug(
            f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}"
        )
        if self.config.report_upstream_lineage:
            self.report.upstream_lineage[dataset_name] = [
                u.dataset for u in upstream_tables
            ]
        return UpstreamLineage(upstreams=upstream_tables), column_lineage
    return None
# Construct upstream tables.
upstreams: List[UpstreamClass] = [
    UpstreamClass(
        dataset=builder.make_dataset_urn("bigquery", "upstream_table_1", "PROD"),
        type=DatasetLineageTypeClass.TRANSFORMED,
    ),
    UpstreamClass(
        dataset=builder.make_dataset_urn("bigquery", "upstream_table_2", "PROD"),
        type=DatasetLineageTypeClass.TRANSFORMED,
    ),
]

# Construct a lineage object.
lineage_aspect = UpstreamLineage(upstreams=upstreams)

# Construct a MetadataChangeProposalWrapper object.
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=builder.make_dataset_urn("bigquery", "downstream"),
    aspectName="upstreamLineage",
    aspect=lineage_aspect,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineage_mcp)
def get_lineage_mcp(
    self, dataset_urn: str
) -> Tuple[Optional[MetadataChangeProposalWrapper], Optional[DatasetPropertiesClass]]:
    """Return (upstreamLineage MCP, dataset properties) for *dataset_urn*.

    Combines parsed-query lineage from self._lineage_map with external
    (Spectrum) catalog lineage from self.catalog_metadata. Returns
    (None, None) for an unparseable urn and (None, properties) when no
    upstreams were found.
    """
    dataset_key = mce_builder.dataset_urn_to_key(dataset_urn)
    if dataset_key is None:
        return None, None
    if self._lineage_map is None:
        logger.debug("Populating lineage")
        self._populate_lineage()
    assert self._lineage_map is not None

    upstream_lineage: List[UpstreamClass] = []
    custom_properties: Dict[str, str] = {}

    if dataset_key.name in self._lineage_map:
        item = self._lineage_map[dataset_key.name]
        # Optionally surface SQL statements the lineage parser failed on.
        if (
            self.config.capture_lineage_query_parser_failures
            and item.query_parser_failed_sqls
        ):
            custom_properties["lineage_sql_parser_failed_queries"] = ",".join(
                item.query_parser_failed_sqls
            )
        for upstream in item.upstreams:
            upstream_table = UpstreamClass(
                dataset=builder.make_dataset_urn_with_platform_instance(
                    upstream.platform.value,
                    upstream.path,
                    platform_instance=self.config.platform_instance_map.get(
                        upstream.platform.value
                    )
                    if self.config.platform_instance_map
                    else None,
                    env=self.config.env,
                ),
                type=item.dataset_lineage_type,
            )
            upstream_lineage.append(upstream_table)

    # External (catalog) lineage: dataset names are expected to be
    # "<db>.<schema>.<table>". Fix: the original indexed parts [1] and [2]
    # unchecked, raising IndexError on names with fewer components — guard
    # the split so malformed names simply skip the catalog lookup.
    dataset_params = dataset_key.name.split(".")
    if len(dataset_params) >= 3:
        db_name, schemaname, tablename = dataset_params[:3]
        if db_name in self.catalog_metadata:
            if schemaname in self.catalog_metadata[db_name]:
                external_db_params = self.catalog_metadata[db_name][schemaname]
                upstream_platform = self.eskind_to_platform[
                    external_db_params["eskind"]
                ]
                catalog_upstream = UpstreamClass(
                    mce_builder.make_dataset_urn_with_platform_instance(
                        upstream_platform,
                        "{database}.{table}".format(
                            database=external_db_params["external_database"],
                            table=tablename,
                        ),
                        platform_instance=self.config.platform_instance_map.get(
                            upstream_platform
                        )
                        if self.config.platform_instance_map
                        else None,
                        env=self.config.env,
                    ),
                    DatasetLineageTypeClass.COPY,
                )
                upstream_lineage.append(catalog_upstream)

    properties = None
    if custom_properties:
        properties = DatasetPropertiesClass(customProperties=custom_properties)

    if upstream_lineage:
        self.report.upstream_lineage[dataset_urn] = [
            u.dataset for u in upstream_lineage
        ]
    else:
        return None, properties

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_urn,
        aspectName="upstreamLineage",
        aspect=UpstreamLineage(upstreams=upstream_lineage),
    )
    return mcp, properties
# NOTE(review): this chunk begins mid-expression — the opening of the
# fineGrainedLineages list (and at least one preceding FineGrainedLineage
# entry) is above this view.
        downstreams=[fldUrn("bar", "c5")],
    ),
    FineGrainedLineage(
        upstreamType=FineGrainedLineageUpstreamType.DATASET,
        upstreams=[datasetUrn("bar4")],
        downstreamType=FineGrainedLineageDownstreamType.FIELD_SET,
        downstreams=[fldUrn("bar", "c6"), fldUrn("bar", "c7")],
    ),
]

# this is just to check if any conflicts with existing Upstream, particularly the DownstreamOf relationship
upstream = Upstream(dataset=datasetUrn("bar2"), type=DatasetLineageType.TRANSFORMED)

# Combine table-level and field-level lineage in one upstreamLineage aspect.
fieldLineages = UpstreamLineage(
    upstreams=[upstream], fineGrainedLineages=fineGrainedLineages
)

lineageMcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=datasetUrn("bar"),
    aspectName="upstreamLineage",
    aspect=fieldLineages,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineageMcp)
def emit_datasource(
    self, datasource: dict, workbook: Optional[dict] = None
) -> Iterable[MetadataWorkUnit]:
    """Emit metadata work units for a Tableau datasource: lineage MCP (when
    upstream tables exist), the dataset snapshot MCE, a subTypes MCP, and —
    for embedded datasources — container membership.

    :param datasource: datasource node as returned by the metadata API.
    :param workbook: enclosing workbook node, when the datasource is embedded.
    """
    # Project/owner metadata comes from the workbook when one is given,
    # otherwise from the datasource itself.
    datasource_info = workbook
    if workbook is None:
        datasource_info = datasource
    project = (
        datasource_info.get("projectName", "").replace("/", REPLACE_SLASH_CHAR)
        if datasource_info
        else ""
    )
    datasource_id = datasource.get("id", "")
    datasource_name = f"{datasource.get('name')}.{datasource_id}"
    datasource_urn = builder.make_dataset_urn(
        self.platform, datasource_id, self.config.env
    )
    # Track ids so datasources are not emitted twice elsewhere.
    if datasource_id not in self.datasource_ids_being_used:
        self.datasource_ids_being_used.append(datasource_id)

    dataset_snapshot = DatasetSnapshot(
        urn=datasource_urn,
        aspects=[],
    )

    # Browse path
    browse_paths = BrowsePathsClass(
        paths=[
            f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource.get('name', '')}/{datasource_name}"
        ]
    )
    dataset_snapshot.aspects.append(browse_paths)

    # Ownership
    owner = (
        self._get_ownership(datasource_info.get("owner", {}).get("username", ""))
        if datasource_info
        else None
    )
    if owner is not None:
        dataset_snapshot.aspects.append(owner)

    # Dataset properties
    dataset_props = DatasetPropertiesClass(
        name=datasource.get("name"),
        description=datasource.get("description"),
        customProperties={
            "hasExtracts": str(datasource.get("hasExtracts", "")),
            "extractLastRefreshTime": datasource.get("extractLastRefreshTime", "")
            or "",
            "extractLastIncrementalUpdateTime": datasource.get(
                "extractLastIncrementalUpdateTime", ""
            )
            or "",
            "extractLastUpdateTime": datasource.get("extractLastUpdateTime", "")
            or "",
            "type": datasource.get("__typename", ""),
        },
    )
    dataset_snapshot.aspects.append(dataset_props)

    # Upstream Tables
    if datasource.get("upstreamTables") is not None:
        # datasource -> db table relations
        upstream_tables = self._create_upstream_table_lineage(datasource, project)
        if upstream_tables:
            upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
            yield self.get_metadata_change_proposal(
                datasource_urn,
                aspect_name="upstreamLineage",
                aspect=upstream_lineage,
            )

    # Datasource Fields
    schema_metadata = self._get_schema_metadata_for_embedded_datasource(
        datasource.get("fields", [])
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    yield self.get_metadata_change_event(dataset_snapshot)
    yield self.get_metadata_change_proposal(
        dataset_snapshot.urn,
        aspect_name="subTypes",
        aspect=SubTypesClass(typeNames=["Data Source"]),
    )

    if datasource.get("__typename") == "EmbeddedDatasource":
        # NOTE(review): gen_workbook_key(workbook) is reached with
        # workbook=None if an embedded datasource is emitted without a
        # workbook — confirm callers always pass one in that case.
        yield from add_entity_to_container(
            self.gen_workbook_key(workbook), "dataset", dataset_snapshot.urn
        )
def _process_table(
    self,
    dataset_name: str,
    inspector: Inspector,
    schema: str,
    table: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
    """Emit all work units for a single SQL table: optional COPY lineage to
    an external location, the dataset snapshot MCE, platform-instance and
    subTypes aspects, and domain work units.

    :param dataset_name: fully qualified dataset name used in urns/ids.
    :param inspector: SQLAlchemy inspector for the source database.
    :param schema: schema containing the table.
    :param table: table name.
    :param sql_config: source config, forwarded to domain resolution.
    """
    columns = self._get_columns(dataset_name, inspector, schema, table)
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.platform,
        dataset_name,
        self.config.platform_instance,
        self.config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[StatusClass(removed=False)],
    )
    # Record the urn in the checkpoint state so stale-entity removal works.
    if self.is_stateful_ingestion_configured():
        cur_checkpoint = self.get_current_checkpoint(
            self.get_default_ingestion_job_id()
        )
        if cur_checkpoint is not None:
            checkpoint_state = cast(
                BaseSQLAlchemyCheckpointState, cur_checkpoint.state
            )
            checkpoint_state.add_table_urn(dataset_urn)
    description, properties, location_urn = self.get_table_properties(
        inspector, schema, table
    )
    dataset_properties = DatasetPropertiesClass(
        name=table,
        description=description,
        customProperties=properties,
    )
    dataset_snapshot.aspects.append(dataset_properties)
    # When the table is backed by an external location, emit COPY lineage
    # from that location as a separate work unit.
    if location_urn:
        external_upstream_table = UpstreamClass(
            dataset=location_urn,
            type=DatasetLineageTypeClass.COPY,
        )
        lineage_mcpw = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_snapshot.urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=[external_upstream_table]),
        )
        lineage_wu = MetadataWorkUnit(
            id=
            f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
            mcp=lineage_mcpw,
        )
        yield lineage_wu
    pk_constraints: dict = inspector.get_pk_constraint(table, schema)
    foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema, table)
    schema_fields = self.get_schema_fields(dataset_name, columns, pk_constraints)
    schema_metadata = get_schema_metadata(
        self.report,
        dataset_name,
        self.platform,
        columns,
        pk_constraints,
        foreign_keys,
        schema_fields,
    )
    dataset_snapshot.aspects.append(schema_metadata)
    db_name = self.get_db_name(inspector)
    # Attach the dataset to its schema container before emitting the MCE.
    yield from self.add_table_to_schema_container(dataset_urn,
                                                  db_name, schema)
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = SqlWorkUnit(id=dataset_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu
    dpi_aspect = self.get_dataplatform_instance_aspect(
        dataset_urn=dataset_urn)
    if dpi_aspect:
        yield dpi_aspect
    subtypes_aspect = MetadataWorkUnit(
        id=f"{dataset_name}-subtypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["table"]),
        ),
    )
    yield subtypes_aspect
    yield from self._get_domain_wu(
        dataset_name=dataset_name,
        entity_urn=dataset_urn,
        entity_type="dataset",
        sql_config=sql_config,
    )