def _create_subType_wu(
    self, node: DBTNode, node_datahub_urn: str
) -> Optional[MetadataWorkUnit]:
    if not node.node_type:
        return None
    subtypes: Optional[List[str]]
    if node.node_type == "model":
        if node.materialization:
            subtypes = [node.materialization, "view"]
        else:
            subtypes = ["model", "view"]
    else:
        subtypes = [node.node_type]
    subtype_mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=node_datahub_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=subtypes),
    )
    subtype_wu = MetadataWorkUnit(
        id=f"{self.platform}-{subtype_mcp.entityUrn}-{subtype_mcp.aspectName}",
        mcp=subtype_mcp,
    )
    return subtype_wu
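Every snippet below follows the same core pattern: wrap a `SubTypesClass` aspect in a `MetadataChangeProposalWrapper` with `aspectName="subTypes"`, then either hand it to the framework as a `MetadataWorkUnit` or emit it directly. A minimal standalone sketch of that pattern, assuming a reachable DataHub GMS at `http://localhost:8080`; the urn and subtype value are illustrative:

from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import ChangeTypeClass, SubTypesClass

# Hypothetical target dataset; any existing dataset urn works here.
dataset_urn = make_dataset_urn(platform="postgres", name="public.orders", env="PROD")

# The same shape used by every source in this section: entityType and
# aspectName are fixed, only the urn and the typeNames payload vary.
subtype_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=dataset_urn,
    aspectName="subTypes",
    aspect=SubTypesClass(typeNames=["table"]),
)

# Emit directly instead of wrapping in a MetadataWorkUnit.
emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit_mcp(subtype_mcp)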
def _build_dataset_mcps(
    self, looker_view: LookerView
) -> List[MetadataChangeProposalWrapper]:
    events = []
    subTypeEvent = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=looker_view.id.get_urn(self.source_config),
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["view"]),
    )
    events.append(subTypeEvent)
    if looker_view.view_details is not None:
        viewEvent = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=looker_view.id.get_urn(self.source_config),
            aspectName="viewProperties",
            aspect=looker_view.view_details,
        )
        events.append(viewEvent)
    return events
def _to_metadata_events(  # noqa: C901
    self, config: LookerCommonConfig, reporter: SourceReport, base_url: str
) -> Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]:
    # We only generate MCEs for explores that contain "from" clauses and do NOT
    # contain joins. All other explores (passthrough explores and joins) end up
    # with correct lineage resolution and don't need additional nodes in the graph.
    dataset_snapshot = DatasetSnapshot(
        urn=self.get_explore_urn(config),
        aspects=[],  # we append to this list later on
    )
    browse_paths = BrowsePathsClass(paths=[self.get_explore_browse_path(config)])
    dataset_snapshot.aspects.append(browse_paths)
    dataset_snapshot.aspects.append(StatusClass(removed=False))

    custom_properties = {}
    if self.label is not None:
        custom_properties["looker.explore.label"] = str(self.label)
    if self.source_file is not None:
        custom_properties["looker.explore.file"] = str(self.source_file)
    dataset_props = DatasetPropertiesClass(
        description=self.description,
        customProperties=custom_properties,
    )
    dataset_props.externalUrl = self._get_url(base_url)
    dataset_snapshot.aspects.append(dataset_props)

    if self.upstream_views is not None:
        assert self.project_name is not None
        upstreams = [
            UpstreamClass(
                dataset=LookerViewId(
                    project_name=self.project_name,
                    model_name=self.model_name,
                    view_name=view_name,
                ).get_urn(config),
                type=DatasetLineageTypeClass.VIEW,
            )
            for view_name in sorted(self.upstream_views)
        ]
        upstream_lineage = UpstreamLineage(upstreams=upstreams)
        dataset_snapshot.aspects.append(upstream_lineage)

    if self.fields is not None:
        schema_metadata = LookerUtil._get_schema(
            platform_name=config.platform_name,
            schema_name=self.name,
            view_fields=self.fields,
            reporter=reporter,
        )
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=dataset_snapshot.urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["explore"]),
    )
    return [mce, mcp]
def gen_containers(
    container_key: KeyType,
    name: str,
    sub_types: List[str],
    parent_container_key: Optional[PlatformKey] = None,
    domain_urn: Optional[str] = None,
    description: Optional[str] = None,
    owner_urn: Optional[str] = None,
    external_url: Optional[str] = None,
    tags: Optional[List[str]] = None,
) -> Iterable[MetadataWorkUnit]:
    container_urn = make_container_urn(
        guid=container_key.guid(),
    )
    mcp = MetadataChangeProposalWrapper(
        entityType="container",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{container_urn}",
        # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
        aspectName="containerProperties",
        aspect=ContainerProperties(
            name=name,
            description=description,
            customProperties=container_key.dict(exclude_none=True, by_alias=True),
            externalUrl=external_url,
        ),
    )
    wu = MetadataWorkUnit(id=f"container-info-{name}-{container_urn}", mcp=mcp)
    yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="container",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{container_urn}",
        # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
        aspectName="dataPlatformInstance",
        aspect=DataPlatformInstance(
            platform=f"{make_data_platform_urn(container_key.platform)}",
        ),
    )
    wu = MetadataWorkUnit(
        id=f"container-platforminstance-{name}-{container_urn}", mcp=mcp
    )
    yield wu

    # Set subtype
    subtype_mcp = MetadataChangeProposalWrapper(
        entityType="container",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{container_urn}",
        # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=sub_types),
    )
    wu = MetadataWorkUnit(
        id=f"container-subtypes-{name}-{container_urn}", mcp=subtype_mcp
    )
    yield wu

    if domain_urn:
        yield from add_domain_to_entity_wu(
            entity_type="container",
            entity_urn=container_urn,
            domain_urn=domain_urn,
        )

    if owner_urn:
        yield from add_owner_to_entity_wu(
            entity_type="container",
            entity_urn=container_urn,
            owner_urn=owner_urn,
        )

    if tags:
        yield from add_tags_to_entity_wu(
            entity_type="container",
            entity_urn=container_urn,
            tags=tags,
        )

    if parent_container_key:
        parent_container_urn = make_container_urn(
            guid=parent_container_key.guid(),
        )

        # Set database container
        parent_container_mcp = MetadataChangeProposalWrapper(
            entityType="container",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=f"{container_urn}",
            # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
            aspectName="container",
            aspect=ContainerClass(container=parent_container_urn),
            # aspect=ContainerKeyClass(guid=database_container_key.guid())
        )
        wu = MetadataWorkUnit(
            id=f"container-parent-container-{name}-{container_urn}-{parent_container_urn}",
            mcp=parent_container_mcp,
        )
        yield wu
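A rough usage sketch for `gen_containers`, assuming `DatabaseKey` from `datahub.emitter.mcp_builder` as the container key (its `platform`/`instance`/`database` field names are an assumption and may differ across versions):

from datahub.emitter.mcp_builder import DatabaseKey, gen_containers

# Hypothetical key; DatabaseKey is one of the PlatformKey subclasses that
# sources pass as container_key (field names assumed, see lead-in).
db_key = DatabaseKey(platform="postgres", instance="PROD", database="analytics")

for wu in gen_containers(
    container_key=db_key,
    name="analytics",
    sub_types=["Database"],
    description="Primary analytics database",
):
    print(wu.id)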
def _extract_record(self, topic: str, partitioned: bool) -> Iterable[MetadataWorkUnit]:
    logger.info(f"topic = {topic}")

    # 1. Create and emit the default dataset for the topic. Extract type, tenant,
    #    namespace and topic name from the full Pulsar topic name, i.e.
    #    persistent://tenant/namespace/topic.
    pulsar_topic = PulsarTopic(topic)
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=pulsar_topic.fullname,
        platform_instance=self.config.platform_instance,
        env=self.config.env,
    )
    status_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-status",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
        ),
    )
    self.report.report_workunit(status_wu)
    yield status_wu

    # 2. Emit schemaMetadata aspect.
    schema, schema_metadata = self._get_schema_metadata(pulsar_topic, platform_urn)
    if schema_metadata is not None:
        schema_metadata_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-schemaMetadata",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="schemaMetadata",
                aspect=schema_metadata,
            ),
        )
        self.report.report_workunit(schema_metadata_wu)
        yield schema_metadata_wu

    # TODO: Add topic properties (Pulsar 2.10.0 feature).

    # 3. Construct and emit dataset properties aspect.
    if schema is not None:
        schema_properties = {
            "schema_version": str(schema.schema_version),
            "schema_type": schema.schema_type,
            "partitioned": str(partitioned).lower(),
        }
        # Add some static properties to the schema properties.
        schema.properties.update(schema_properties)

        dataset_properties_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-datasetProperties",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    description=schema.schema_description,
                    customProperties=schema.properties,
                ),
            ),
        )
        self.report.report_workunit(dataset_properties_wu)
        yield dataset_properties_wu

    # 4. Emit browsePaths aspect.
    pulsar_path = f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
    browse_path_suffix = (
        f"{self.config.platform_instance}/{pulsar_path}"
        if self.config.platform_instance
        else pulsar_path
    )
    browse_path_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-browsePaths",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="browsePaths",
            aspect=BrowsePathsClass(
                [f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"]
            ),
        ),
    )
    self.report.report_workunit(browse_path_wu)
    yield browse_path_wu

    # 5. Emit dataPlatformInstance aspect.
    if self.config.platform_instance:
        platform_instance_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-dataPlatformInstance",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="dataPlatformInstance",
                aspect=DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.config.platform_instance
                    ),
                ),
            ),
        )
        self.report.report_workunit(platform_instance_wu)
        yield platform_instance_wu

    # 6. Emit subtype aspect marking this as a "topic".
    subtype_wu = MetadataWorkUnit(
        id=f"{dataset_urn}-subTypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit domains aspect.
    domain_urn: Optional[str] = None
    for domain, pattern in self.config.domain.items():
        if pattern.allowed(pulsar_topic.fullname):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
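The domain loop in step 7 hinges on `AllowDenyPattern.allowed`, which regex-matches the topic's full name against each configured domain. A self-contained illustration with made-up config values:

from datahub.configuration.common import AllowDenyPattern
from datahub.emitter.mce_builder import make_domain_urn

# Hypothetical source config: domain name -> pattern, as in self.config.domain.
domain_config = {
    "sales": AllowDenyPattern(allow=["persistent://sales/.*"]),
    "finance": AllowDenyPattern(allow=["persistent://finance/.*"]),
}

topic_fullname = "persistent://sales/orders/created"
domain_urn = None
for domain, pattern in domain_config.items():
    if pattern.allowed(topic_fullname):
        domain_urn = make_domain_urn(domain)

assert domain_urn == "urn:li:domain:sales"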
def _extract_mcps(self, index: str) -> Iterable[MetadataChangeProposalWrapper]:
    logger.debug(f"index = {index}")
    raw_index = self.client.indices.get(index=index)
    raw_index_metadata = raw_index[index]

    # 0. Dedup data_streams.
    data_stream = raw_index_metadata.get("data_stream")
    if data_stream:
        index = data_stream
        self.data_stream_partition_count[index] += 1
        if self.data_stream_partition_count[index] > 1:
            # This is a duplicate, skip processing it further.
            return

    # 1. Construct and emit the schemaMetadata aspect.
    # 1.1 Generate the schema fields from ES mappings.
    index_mappings = raw_index_metadata["mappings"]
    index_mappings_json_str: str = json.dumps(index_mappings)
    md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
    schema_fields = list(
        ElasticToSchemaFieldConverter.get_schema_fields(index_mappings)
    )

    # 1.2 Generate the SchemaMetadata aspect.
    schema_metadata = SchemaMetadata(
        schemaName=index,
        platform=make_data_platform_urn(self.platform),
        version=0,
        hash=md5_hash,
        platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
        fields=schema_fields,
    )

    # 1.3 Emit the mcp.
    dataset_urn: str = make_dataset_urn(self.platform, index, self.source_config.env)
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="schemaMetadata",
        aspect=schema_metadata,
        changeType=ChangeTypeClass.UPSERT,
    )

    # 2. Construct and emit the status aspect.
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="status",
        aspect=StatusClass(removed=False),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 3. Construct and emit subtype.
    yield MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=["Index" if not data_stream else "DataStream"]),
        changeType=ChangeTypeClass.UPSERT,
    )

    # 4. Construct and emit properties if needed.
    index_aliases = raw_index_metadata.get("aliases", {}).keys()
    if index_aliases:
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="datasetProperties",
            aspect=DatasetPropertiesClass(
                customProperties={"aliases": ",".join(index_aliases)}
            ),
            changeType=ChangeTypeClass.UPSERT,
        )
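Step 0 above works because every backing index of an Elasticsearch data stream reports the stream's name under `data_stream`, so a per-stream counter lets only the first one through. A tiny standalone demonstration of that dedup logic (index names are illustrative):

from collections import defaultdict
from typing import DefaultDict, Optional

# Mimics self.data_stream_partition_count: data stream name -> indices seen.
stream_counts: DefaultDict[str, int] = defaultdict(int)

def should_process(index: str, data_stream: Optional[str] = None) -> bool:
    # Hypothetical helper mirroring step 0: the first backing index of a data
    # stream is processed under the stream's name; subsequent ones are skipped.
    if not data_stream:
        return True
    stream_counts[data_stream] += 1
    return stream_counts[data_stream] == 1

assert should_process(".ds-logs-000001", data_stream="logs")
assert not should_process(".ds-logs-000002", data_stream="logs")
assert should_process("standalone-index")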
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
    count_on_query = len(self.custom_sql_ids_being_used)
    custom_sql_filter = "idWithin: {}".format(
        json.dumps(self.custom_sql_ids_being_used)
    )
    custom_sql_connection, total_count, has_next_page = self.get_connection_object(
        custom_sql_graphql_query, "customSQLTablesConnection", custom_sql_filter
    )

    current_count = 0
    while has_next_page:
        count = (
            count_on_query
            if current_count + count_on_query < total_count
            else total_count - current_count
        )
        (
            custom_sql_connection,
            total_count,
            has_next_page,
        ) = self.get_connection_object(
            custom_sql_graphql_query,
            "customSQLTablesConnection",
            custom_sql_filter,
            count,
            current_count,
        )
        current_count += count

        unique_custom_sql = get_unique_custom_sql(
            custom_sql_connection.get("nodes", [])
        )
        for csql in unique_custom_sql:
            csql_id: str = csql.get("id", "")
            csql_urn = builder.make_dataset_urn(
                self.platform, csql_id, self.config.env
            )
            dataset_snapshot = DatasetSnapshot(
                urn=csql_urn,
                aspects=[],
            )

            # lineage from datasource -> custom sql source
            yield from self._create_lineage_from_csql_datasource(
                csql_urn, csql.get("datasources", [])
            )

            # lineage from custom sql -> datasets/tables
            columns = csql.get("columns", [])
            yield from self._create_lineage_to_upstream_tables(csql_urn, columns)

            # Schema Metadata
            schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            # Browse path
            browse_paths = BrowsePathsClass(
                paths=[
                    f"/{self.config.env.lower()}/{self.platform}/Custom SQL/{csql.get('name', '')}/{csql_id}"
                ]
            )
            dataset_snapshot.aspects.append(browse_paths)

            dataset_properties = DatasetPropertiesClass(
                name=csql.get("name"), description=csql.get("description")
            )
            dataset_snapshot.aspects.append(dataset_properties)

            view_properties = ViewPropertiesClass(
                materialized=False,
                viewLanguage="SQL",
                viewLogic=clean_query(csql.get("query", "")),
            )
            dataset_snapshot.aspects.append(view_properties)

            yield self.get_metadata_change_event(dataset_snapshot)
            yield self.get_metadata_change_proposal(
                dataset_snapshot.urn,
                aspect_name="subTypes",
                aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
            )
def emit_datasource(
    self, datasource: dict, workbook: Optional[dict] = None
) -> Iterable[MetadataWorkUnit]:
    datasource_info = workbook
    if workbook is None:
        datasource_info = datasource

    project = (
        datasource_info.get("projectName", "").replace("/", REPLACE_SLASH_CHAR)
        if datasource_info
        else ""
    )
    datasource_id = datasource.get("id", "")
    datasource_name = f"{datasource.get('name')}.{datasource_id}"
    datasource_urn = builder.make_dataset_urn(
        self.platform, datasource_id, self.config.env
    )
    if datasource_id not in self.datasource_ids_being_used:
        self.datasource_ids_being_used.append(datasource_id)

    dataset_snapshot = DatasetSnapshot(
        urn=datasource_urn,
        aspects=[],
    )

    # Browse path
    browse_paths = BrowsePathsClass(
        paths=[
            f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource.get('name', '')}/{datasource_name}"
        ]
    )
    dataset_snapshot.aspects.append(browse_paths)

    # Ownership
    owner = (
        self._get_ownership(datasource_info.get("owner", {}).get("username", ""))
        if datasource_info
        else None
    )
    if owner is not None:
        dataset_snapshot.aspects.append(owner)

    # Dataset properties
    dataset_props = DatasetPropertiesClass(
        name=datasource.get("name"),
        description=datasource.get("description"),
        customProperties={
            "hasExtracts": str(datasource.get("hasExtracts", "")),
            "extractLastRefreshTime": datasource.get("extractLastRefreshTime", "")
            or "",
            "extractLastIncrementalUpdateTime": datasource.get(
                "extractLastIncrementalUpdateTime", ""
            )
            or "",
            "extractLastUpdateTime": datasource.get("extractLastUpdateTime", "")
            or "",
            "type": datasource.get("__typename", ""),
        },
    )
    dataset_snapshot.aspects.append(dataset_props)

    # Upstream Tables
    if datasource.get("upstreamTables") is not None:
        # datasource -> db table relations
        upstream_tables = self._create_upstream_table_lineage(datasource, project)
        if upstream_tables:
            upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
            yield self.get_metadata_change_proposal(
                datasource_urn,
                aspect_name="upstreamLineage",
                aspect=upstream_lineage,
            )

    # Datasource Fields
    schema_metadata = self._get_schema_metadata_for_embedded_datasource(
        datasource.get("fields", [])
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    yield self.get_metadata_change_event(dataset_snapshot)
    yield self.get_metadata_change_proposal(
        dataset_snapshot.urn,
        aspect_name="subTypes",
        aspect=SubTypesClass(typeNames=["Data Source"]),
    )

    if datasource.get("__typename") == "EmbeddedDatasource":
        yield from add_entity_to_container(
            self.gen_workbook_key(workbook), "dataset", dataset_snapshot.urn
        )
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
    count_on_query = len(self.custom_sql_ids_being_used)
    custom_sql_filter = "idWithin: {}".format(
        json.dumps(self.custom_sql_ids_being_used)
    )
    custom_sql_connection, total_count, has_next_page = self.get_connection_object(
        custom_sql_graphql_query, "customSQLTablesConnection", custom_sql_filter
    )

    current_count = 0
    while has_next_page:
        count = (
            count_on_query
            if current_count + count_on_query < total_count
            else total_count - current_count
        )
        (
            custom_sql_connection,
            total_count,
            has_next_page,
        ) = self.get_connection_object(
            custom_sql_graphql_query,
            "customSQLTablesConnection",
            custom_sql_filter,
            count,
            current_count,
        )
        current_count += count

        unique_custom_sql = get_unique_custom_sql(
            custom_sql_connection.get("nodes", [])
        )
        for csql in unique_custom_sql:
            csql_id: str = csql["id"]
            csql_urn = builder.make_dataset_urn(
                self.platform, csql_id, self.config.env
            )
            dataset_snapshot = DatasetSnapshot(
                urn=csql_urn,
                aspects=[],
            )

            datasource_name = None
            project = None
            if len(csql["datasources"]) > 0:
                yield from self._create_lineage_from_csql_datasource(
                    csql_urn, csql["datasources"]
                )

                # A CustomSQLTable is owned by exactly one tableau data source.
                logger.debug(
                    f"Number of datasources referencing CustomSQLTable: {len(csql['datasources'])}"
                )
                datasource = csql["datasources"][0]
                datasource_name = datasource.get("name")
                if datasource.get(
                    "__typename"
                ) == "EmbeddedDatasource" and datasource.get("workbook"):
                    datasource_name = (
                        f"{datasource.get('workbook').get('name')}/{datasource_name}"
                        if datasource_name and datasource.get("workbook").get("name")
                        else None
                    )
                    yield from add_entity_to_container(
                        self.gen_workbook_key(datasource["workbook"]),
                        "dataset",
                        dataset_snapshot.urn,
                    )
                project = self._get_project(datasource)

            # lineage from custom sql -> datasets/tables
            columns = csql.get("columns", [])
            yield from self._create_lineage_to_upstream_tables(csql_urn, columns)

            # Schema Metadata
            schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

            # Browse path
            csql_name = csql.get("name") if csql.get("name") else csql_id
            if project and datasource_name:
                browse_paths = BrowsePathsClass(
                    paths=[
                        f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource['name']}/{csql_name}"
                    ]
                )
                dataset_snapshot.aspects.append(browse_paths)
            else:
                logger.debug(f"Browse path not set for Custom SQL table {csql_id}")

            dataset_properties = DatasetPropertiesClass(
                name=csql.get("name"), description=csql.get("description")
            )
            dataset_snapshot.aspects.append(dataset_properties)

            view_properties = ViewPropertiesClass(
                materialized=False,
                viewLanguage="SQL",
                viewLogic=clean_query(csql.get("query", "")),
            )
            dataset_snapshot.aspects.append(view_properties)

            yield self.get_metadata_change_event(dataset_snapshot)
            yield self.get_metadata_change_proposal(
                dataset_snapshot.urn,
                aspect_name="subTypes",
                aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
            )
def _extract_record(self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
    logger.debug(f"topic = {topic}")

    # 1. Create the default dataset snapshot for the topic.
    dataset_name = topic
    platform_urn = make_data_platform_urn(self.platform)
    dataset_urn = make_dataset_urn_with_platform_instance(
        platform=self.platform,
        name=dataset_name,
        platform_instance=self.source_config.platform_instance,
        env=self.source_config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[Status(removed=False)],  # we append to this list later on
    )

    # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry).
    schema_metadata = self.schema_registry_client.get_schema_metadata(
        topic, platform_urn
    )
    if schema_metadata is not None:
        dataset_snapshot.aspects.append(schema_metadata)

    # 3. Attach browsePaths aspect.
    browse_path_suffix = (
        f"{self.source_config.platform_instance}/{topic}"
        if self.source_config.platform_instance
        else topic
    )
    browse_path = BrowsePathsClass(
        [f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"]
    )
    dataset_snapshot.aspects.append(browse_path)

    # 4. Attach dataPlatformInstance aspect.
    if self.source_config.platform_instance:
        dataset_snapshot.aspects.append(
            DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, self.source_config.platform_instance
                ),
            )
        )

    # 5. Emit the datasetSnapshot MCE.
    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
    self.report.report_workunit(wu)
    yield wu

    # 6. Add the subtype aspect marking this as a "topic".
    subtype_wu = MetadataWorkUnit(
        id=f"{topic}-subtype",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        ),
    )
    self.report.report_workunit(subtype_wu)
    yield subtype_wu

    # 7. Emit domains aspect MCPW.
    domain_urn: Optional[str] = None
    for domain, pattern in self.source_config.domain.items():
        if pattern.allowed(dataset_name):
            domain_urn = make_domain_urn(domain)

    if domain_urn:
        wus = add_domain_to_entity_wu(
            entity_type="dataset",
            entity_urn=dataset_urn,
            domain_urn=domain_urn,
        )
        for wu in wus:
            self.report.report_workunit(wu)
            yield wu
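Both the Kafka and Pulsar sources build their urns with `make_dataset_urn_with_platform_instance`, which folds an optional platform instance into the dataset name. A quick sketch of the resulting urn shapes (the expected strings reflect my understanding of the helper and may vary by version):

from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance

# Without a platform instance the urn uses the bare topic name...
assert (
    make_dataset_urn_with_platform_instance("kafka", "orders", None, "PROD")
    == "urn:li:dataset:(urn:li:dataPlatform:kafka,orders,PROD)"
)

# ...and with one, the instance is folded into the dataset name.
assert (
    make_dataset_urn_with_platform_instance("kafka", "orders", "cluster-1", "PROD")
    == "urn:li:dataset:(urn:li:dataPlatform:kafka,cluster-1.orders,PROD)"
)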
def _process_table(
    self,
    dataset_name: str,
    inspector: Inspector,
    schema: str,
    table: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
    columns = self._get_columns(dataset_name, inspector, schema, table)
    dataset_urn = make_dataset_urn_with_platform_instance(
        self.platform,
        dataset_name,
        self.config.platform_instance,
        self.config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[StatusClass(removed=False)],
    )

    if self.is_stateful_ingestion_configured():
        cur_checkpoint = self.get_current_checkpoint(
            self.get_default_ingestion_job_id()
        )
        if cur_checkpoint is not None:
            checkpoint_state = cast(
                BaseSQLAlchemyCheckpointState, cur_checkpoint.state
            )
            checkpoint_state.add_table_urn(dataset_urn)

    description, properties, location_urn = self.get_table_properties(
        inspector, schema, table
    )
    dataset_properties = DatasetPropertiesClass(
        name=table,
        description=description,
        customProperties=properties,
    )
    dataset_snapshot.aspects.append(dataset_properties)

    if location_urn:
        external_upstream_table = UpstreamClass(
            dataset=location_urn,
            type=DatasetLineageTypeClass.COPY,
        )
        lineage_mcpw = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_snapshot.urn,
            aspectName="upstreamLineage",
            aspect=UpstreamLineage(upstreams=[external_upstream_table]),
        )
        lineage_wu = MetadataWorkUnit(
            id=f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
            mcp=lineage_mcpw,
        )
        yield lineage_wu

    pk_constraints: dict = inspector.get_pk_constraint(table, schema)
    foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema, table)
    schema_fields = self.get_schema_fields(dataset_name, columns, pk_constraints)
    schema_metadata = get_schema_metadata(
        self.report,
        dataset_name,
        self.platform,
        columns,
        pk_constraints,
        foreign_keys,
        schema_fields,
    )
    dataset_snapshot.aspects.append(schema_metadata)

    db_name = self.get_db_name(inspector)
    yield from self.add_table_to_schema_container(dataset_urn, db_name, schema)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = SqlWorkUnit(id=dataset_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
    if dpi_aspect:
        yield dpi_aspect

    subtypes_aspect = MetadataWorkUnit(
        id=f"{dataset_name}-subtypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["table"]),
        ),
    )
    yield subtypes_aspect

    yield from self._get_domain_wu(
        dataset_name=dataset_name,
        entity_urn=dataset_urn,
        entity_type="dataset",
        sql_config=sql_config,
    )
def _process_view(
    self,
    dataset_name: str,
    inspector: Inspector,
    schema: str,
    view: str,
    sql_config: SQLAlchemyConfig,
) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
    try:
        columns = inspector.get_columns(view, schema)
    except KeyError:
        # For certain types of views, we are unable to fetch the list of columns.
        self.report.report_warning(dataset_name, "unable to get schema for this view")
        schema_metadata = None
    else:
        schema_fields = self.get_schema_fields(dataset_name, columns)
        schema_metadata = get_schema_metadata(
            self.report,
            dataset_name,
            self.platform,
            columns,
            canonical_schema=schema_fields,
        )

    try:
        # SQLAlchemy stubs are incomplete and missing this method.
        # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
        view_info: dict = inspector.get_table_comment(view, schema)  # type: ignore
    except NotImplementedError:
        description: Optional[str] = None
        properties: Dict[str, str] = {}
    else:
        description = view_info["text"]
        # The "properties" field is a non-standard addition to SQLAlchemy's interface.
        properties = view_info.get("properties", {})

    try:
        view_definition = inspector.get_view_definition(view, schema)
        if view_definition is None:
            view_definition = ""
        else:
            # Some dialects return a TextClause instead of a raw string,
            # so we need to convert them to a string.
            view_definition = str(view_definition)
    except NotImplementedError:
        view_definition = ""
    properties["view_definition"] = view_definition
    properties["is_view"] = "True"

    dataset_urn = make_dataset_urn_with_platform_instance(
        self.platform,
        dataset_name,
        self.config.platform_instance,
        self.config.env,
    )
    dataset_snapshot = DatasetSnapshot(
        urn=dataset_urn,
        aspects=[StatusClass(removed=False)],
    )

    db_name = self.get_db_name(inspector)
    yield from self.add_table_to_schema_container(dataset_urn, db_name, schema)

    if self.is_stateful_ingestion_configured():
        cur_checkpoint = self.get_current_checkpoint(
            self.get_default_ingestion_job_id()
        )
        if cur_checkpoint is not None:
            checkpoint_state = cast(
                BaseSQLAlchemyCheckpointState, cur_checkpoint.state
            )
            checkpoint_state.add_view_urn(dataset_urn)

    dataset_properties = DatasetPropertiesClass(
        name=view,
        description=description,
        customProperties=properties,
    )
    dataset_snapshot.aspects.append(dataset_properties)

    if schema_metadata:
        dataset_snapshot.aspects.append(schema_metadata)

    mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
    wu = SqlWorkUnit(id=dataset_name, mce=mce)
    self.report.report_workunit(wu)
    yield wu

    dpi_aspect = self.get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
    if dpi_aspect:
        yield dpi_aspect

    subtypes_aspect = MetadataWorkUnit(
        id=f"{view}-subtypes",
        mcp=MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["view"]),
        ),
    )
    yield subtypes_aspect

    if "view_definition" in properties:
        view_definition_string = properties["view_definition"]
        view_properties_aspect = ViewPropertiesClass(
            materialized=False,
            viewLanguage="SQL",
            viewLogic=view_definition_string,
        )
        yield MetadataWorkUnit(
            id=f"{view}-viewProperties",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="viewProperties",
                aspect=view_properties_aspect,
            ),
        )

    yield from self._get_domain_wu(
        dataset_name=dataset_name,
        entity_urn=dataset_urn,
        entity_type="dataset",
        sql_config=sql_config,
    )
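The `str(view_definition)` coercion above exists because some SQLAlchemy dialects return a `TextClause` rather than a plain string from `get_view_definition`. A small demonstration (the query is made up):

from sqlalchemy import text

# Some dialects hand back a TextClause; str() yields the underlying SQL,
# which is why the code above coerces before storing it as a property.
view_definition = text("SELECT id, total FROM orders WHERE status = 'open'")
assert str(view_definition) == "SELECT id, total FROM orders WHERE status = 'open'"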
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:
        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(f"Did not find aspect name: {aspect_name} in aspect_registry")
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append(
                {
                    "type": aspect_schema.to_json(),
                    "name": aspect_name,
                }
            )

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())

            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [urn_field] + avro_schema_to_mce_fields(
                rawSchema
            )

            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect")
                        )
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal")
                            )
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[]
                        )
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable")
                        )
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info
                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", []
                        )
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(entity_type)
                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
                    ),
                    BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)
            )
        )
        mce = MetadataChangeEventClass(proposedSnapshot=d)
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)

    return events
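The nested `strip_types` helper reduces a v2-encoded field path to its bare field name, which is what the relationship edges display. Reproduced standalone with an illustrative path:

import re

def strip_types(field_path: str) -> str:
    final_path = field_path
    final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
    final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
    return final_path

# A v2-encoded field path collapses to the plain field name:
assert strip_types("[version=2.0].[type=Ownership].[type=Owner].owners") == "owners"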