Ejemplo n.º 1
0
 def _create_subType_wu(
         self, node: DBTNode,
         node_datahub_urn: str) -> Optional[MetadataWorkUnit]:
     if not node.node_type:
         return None
     subtypes: Optional[List[str]]
     if node.node_type == "model":
         if node.materialization:
             subtypes = [node.materialization, "view"]
         else:
             subtypes = ["model", "view"]
     else:
         subtypes = [node.node_type]
     subtype_mcp = MetadataChangeProposalWrapper(
         entityType="dataset",
         changeType=ChangeTypeClass.UPSERT,
         entityUrn=node_datahub_urn,
         aspectName="subTypes",
         aspect=SubTypesClass(typeNames=subtypes),
     )
     subtype_wu = MetadataWorkUnit(
         id=
         f"{self.platform}-{subtype_mcp.entityUrn}-{subtype_mcp.aspectName}",
         mcp=subtype_mcp,
     )
     return subtype_wu
Ejemplo n.º 2
0
    def _build_dataset_mcps(
            self,
            looker_view: LookerView) -> List[MetadataChangeProposalWrapper]:
        events = []
        subTypeEvent = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=looker_view.id.get_urn(self.source_config),
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["view"]),
        )
        events.append(subTypeEvent)
        if looker_view.view_details is not None:
            viewEvent = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=looker_view.id.get_urn(self.source_config),
                aspectName="viewProperties",
                aspect=looker_view.view_details,
            )
            events.append(viewEvent)

        return events
Ejemplo n.º 3
0
    def _to_metadata_events(  # noqa: C901
        self, config: LookerCommonConfig, reporter: SourceReport,
        base_url: str) -> Optional[List[Union[MetadataChangeEvent,
                                              MetadataChangeProposalWrapper]]]:
        # We only generate MCE-s for explores that contain from clauses and do NOT contain joins
        # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph.

        dataset_snapshot = DatasetSnapshot(
            urn=self.get_explore_urn(config),
            aspects=[],  # we append to this list later on
        )
        browse_paths = BrowsePathsClass(
            paths=[self.get_explore_browse_path(config)])
        dataset_snapshot.aspects.append(browse_paths)
        dataset_snapshot.aspects.append(StatusClass(removed=False))

        custom_properties = {}
        if self.label is not None:
            custom_properties["looker.explore.label"] = str(self.label)
        if self.source_file is not None:
            custom_properties["looker.explore.file"] = str(self.source_file)
        dataset_props = DatasetPropertiesClass(
            description=self.description,
            customProperties=custom_properties,
        )
        dataset_props.externalUrl = self._get_url(base_url)

        dataset_snapshot.aspects.append(dataset_props)
        if self.upstream_views is not None:
            assert self.project_name is not None
            upstreams = [
                UpstreamClass(
                    dataset=LookerViewId(
                        project_name=self.project_name,
                        model_name=self.model_name,
                        view_name=view_name,
                    ).get_urn(config),
                    type=DatasetLineageTypeClass.VIEW,
                ) for view_name in sorted(self.upstream_views)
            ]
            upstream_lineage = UpstreamLineage(upstreams=upstreams)
            dataset_snapshot.aspects.append(upstream_lineage)
        if self.fields is not None:
            schema_metadata = LookerUtil._get_schema(
                platform_name=config.platform_name,
                schema_name=self.name,
                view_fields=self.fields,
                reporter=reporter,
            )
            if schema_metadata is not None:
                dataset_snapshot.aspects.append(schema_metadata)

        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_snapshot.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["explore"]),
        )

        return [mce, mcp]
Ejemplo n.º 4
0
def gen_containers(
    container_key: KeyType,
    name: str,
    sub_types: List[str],
    parent_container_key: Optional[PlatformKey] = None,
    domain_urn: Optional[str] = None,
    description: Optional[str] = None,
    owner_urn: Optional[str] = None,
    external_url: Optional[str] = None,
    tags: Optional[List[str]] = None,
) -> Iterable[MetadataWorkUnit]:
    container_urn = make_container_urn(guid=container_key.guid(), )
    mcp = MetadataChangeProposalWrapper(
        entityType="container",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{container_urn}",
        # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
        aspectName="containerProperties",
        aspect=ContainerProperties(
            name=name,
            description=description,
            customProperties=container_key.dict(exclude_none=True,
                                                by_alias=True),
            externalUrl=external_url,
        ),
    )
    wu = MetadataWorkUnit(id=f"container-info-{name}-{container_urn}", mcp=mcp)
    yield wu

    mcp = MetadataChangeProposalWrapper(
        entityType="container",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{container_urn}",
        # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
        aspectName="dataPlatformInstance",
        aspect=DataPlatformInstance(
            platform=f"{make_data_platform_urn(container_key.platform)}", ),
    )
    wu = MetadataWorkUnit(
        id=f"container-platforminstance-{name}-{container_urn}", mcp=mcp)
    yield wu

    # Set subtype
    subtype_mcp = MetadataChangeProposalWrapper(
        entityType="container",
        changeType=ChangeTypeClass.UPSERT,
        entityUrn=f"{container_urn}",
        # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
        aspectName="subTypes",
        aspect=SubTypesClass(typeNames=sub_types),
    )
    wu = MetadataWorkUnit(id=f"container-subtypes-{name}-{container_urn}",
                          mcp=subtype_mcp)
    yield wu

    if domain_urn:
        yield from add_domain_to_entity_wu(
            entity_type="container",
            entity_urn=container_urn,
            domain_urn=domain_urn,
        )

    if owner_urn:
        yield from add_owner_to_entity_wu(
            entity_type="container",
            entity_urn=container_urn,
            owner_urn=owner_urn,
        )

    if tags:
        yield from add_tags_to_entity_wu(
            entity_type="container",
            entity_urn=container_urn,
            tags=tags,
        )

    if parent_container_key:
        parent_container_urn = make_container_urn(
            guid=parent_container_key.guid(), )

        # Set database container
        parent_container_mcp = MetadataChangeProposalWrapper(
            entityType="container",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=f"{container_urn}",
            # entityKeyAspect=ContainerKeyClass(guid=schema_container_key.guid()),
            aspectName="container",
            aspect=ContainerClass(container=parent_container_urn),
            # aspect=ContainerKeyClass(guid=database_container_key.guid())
        )
        wu = MetadataWorkUnit(
            id=
            f"container-parent-container-{name}-{container_urn}-{parent_container_urn}",
            mcp=parent_container_mcp,
        )

        yield wu
Ejemplo n.º 5
0
    def _extract_record(self, topic: str,
                        partitioned: bool) -> Iterable[MetadataWorkUnit]:
        logger.info(f"topic = {topic}")

        # 1. Create and emit the default dataset for the topic. Extract type, tenant, namespace
        # and topic name from full Pulsar topic name i.e. persistent://tenant/namespace/topic
        pulsar_topic = PulsarTopic(topic)

        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=pulsar_topic.fullname,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )

        status_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-status",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="status",
                aspect=StatusClass(removed=False),
            ),
        )
        self.report.report_workunit(status_wu)
        yield status_wu

        # 2. Emit schemaMetadata aspect
        schema, schema_metadata = self._get_schema_metadata(
            pulsar_topic, platform_urn)
        if schema_metadata is not None:
            schema_metadata_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-schemaMetadata",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="schemaMetadata",
                    aspect=schema_metadata,
                ),
            )
            self.report.report_workunit(schema_metadata_wu)
            yield schema_metadata_wu

        # TODO Add topic properties (Pulsar 2.10.0 feature)
        # 3. Construct and emit dataset properties aspect
        if schema is not None:
            schema_properties = {
                "schema_version": str(schema.schema_version),
                "schema_type": schema.schema_type,
                "partitioned": str(partitioned).lower(),
            }
            # Add some static properties to the schema properties
            schema.properties.update(schema_properties)

            dataset_properties_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-datasetProperties",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="datasetProperties",
                    aspect=DatasetPropertiesClass(
                        description=schema.schema_description,
                        customProperties=schema.properties,
                    ),
                ),
            )
            self.report.report_workunit(dataset_properties_wu)
            yield dataset_properties_wu

        # 4. Emit browsePaths aspect
        pulsar_path = (
            f"{pulsar_topic.tenant}/{pulsar_topic.namespace}/{pulsar_topic.topic}"
        )
        browse_path_suffix = (f"{self.config.platform_instance}/{pulsar_path}"
                              if self.config.platform_instance else
                              pulsar_path)

        browse_path_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-browsePaths",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="browsePaths",
                aspect=BrowsePathsClass([
                    f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"
                ]),
            ),
        )
        self.report.report_workunit(browse_path_wu)
        yield browse_path_wu

        # 5. Emit dataPlatformInstance aspect.
        if self.config.platform_instance:
            platform_instance_wu = MetadataWorkUnit(
                id=f"{dataset_urn}-dataPlatformInstance",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="dataPlatformInstance",
                    aspect=DataPlatformInstanceClass(
                        platform=platform_urn,
                        instance=make_dataplatform_instance_urn(
                            self.platform, self.config.platform_instance),
                    ),
                ),
            )
            self.report.report_workunit(platform_instance_wu)
            yield platform_instance_wu

        # 6. Emit subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{dataset_urn}-subTypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        # 7. Emit domains aspect
        domain_urn: Optional[str] = None
        for domain, pattern in self.config.domain.items():
            if pattern.allowed(pulsar_topic.fullname):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
Ejemplo n.º 6
0
    def _extract_mcps(self,
                      index: str) -> Iterable[MetadataChangeProposalWrapper]:
        logger.debug(f"index = {index}")
        raw_index = self.client.indices.get(index=index)
        raw_index_metadata = raw_index[index]

        # 0. Dedup data_streams.
        data_stream = raw_index_metadata.get("data_stream")
        if data_stream:
            index = data_stream
            self.data_stream_partition_count[index] += 1
            if self.data_stream_partition_count[index] > 1:
                # This is a duplicate, skip processing it further.
                return

        # 1. Construct and emit the schemaMetadata aspect
        # 1.1 Generate the schema fields from ES mappings.
        index_mappings = raw_index_metadata["mappings"]
        index_mappings_json_str: str = json.dumps(index_mappings)
        md5_hash = md5(index_mappings_json_str.encode()).hexdigest()
        schema_fields = list(
            ElasticToSchemaFieldConverter.get_schema_fields(index_mappings))

        # 1.2 Generate the SchemaMetadata aspect
        schema_metadata = SchemaMetadata(
            schemaName=index,
            platform=make_data_platform_urn(self.platform),
            version=0,
            hash=md5_hash,
            platformSchema=OtherSchemaClass(rawSchema=index_mappings_json_str),
            fields=schema_fields,
        )

        # 1.3 Emit the mcp
        dataset_urn: str = make_dataset_urn(self.platform, index,
                                            self.source_config.env)
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="schemaMetadata",
            aspect=schema_metadata,
            changeType=ChangeTypeClass.UPSERT,
        )

        # 2. Construct and emit the status aspect.
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="status",
            aspect=StatusClass(removed=False),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 3. Construct and emit subtype
        yield MetadataChangeProposalWrapper(
            entityType="dataset",
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(
                typeNames=["Index" if not data_stream else "DataStream"]),
            changeType=ChangeTypeClass.UPSERT,
        )

        # 4. Construct and emit properties if needed
        index_aliases = raw_index_metadata.get("aliases", {}).keys()
        if index_aliases:
            yield MetadataChangeProposalWrapper(
                entityType="dataset",
                entityUrn=dataset_urn,
                aspectName="datasetProperties",
                aspect=DatasetPropertiesClass(
                    customProperties={"aliases": ",".join(index_aliases)}),
                changeType=ChangeTypeClass.UPSERT,
            )
Ejemplo n.º 7
0
    def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
        count_on_query = len(self.custom_sql_ids_being_used)
        custom_sql_filter = "idWithin: {}".format(
            json.dumps(self.custom_sql_ids_being_used))
        custom_sql_connection, total_count, has_next_page = self.get_connection_object(
            custom_sql_graphql_query, "customSQLTablesConnection",
            custom_sql_filter)

        current_count = 0
        while has_next_page:
            count = (count_on_query if current_count +
                     count_on_query < total_count else total_count -
                     current_count)
            (
                custom_sql_connection,
                total_count,
                has_next_page,
            ) = self.get_connection_object(
                custom_sql_graphql_query,
                "customSQLTablesConnection",
                custom_sql_filter,
                count,
                current_count,
            )
            current_count += count

            unique_custom_sql = get_unique_custom_sql(
                custom_sql_connection.get("nodes", []))
            for csql in unique_custom_sql:
                csql_id: str = csql.get("id", "")
                csql_urn = builder.make_dataset_urn(self.platform, csql_id,
                                                    self.config.env)
                dataset_snapshot = DatasetSnapshot(
                    urn=csql_urn,
                    aspects=[],
                )

                # lineage from datasource -> custom sql source #
                yield from self._create_lineage_from_csql_datasource(
                    csql_urn, csql.get("datasources", []))

                # lineage from custom sql -> datasets/tables #
                columns = csql.get("columns", [])
                yield from self._create_lineage_to_upstream_tables(
                    csql_urn, columns)

                #  Schema Metadata
                schema_metadata = self.get_schema_metadata_for_custom_sql(
                    columns)
                if schema_metadata is not None:
                    dataset_snapshot.aspects.append(schema_metadata)

                # Browse path
                browse_paths = BrowsePathsClass(paths=[
                    f"/{self.config.env.lower()}/{self.platform}/Custom SQL/{csql.get('name', '')}/{csql_id}"
                ])
                dataset_snapshot.aspects.append(browse_paths)

                dataset_properties = DatasetPropertiesClass(
                    name=csql.get("name"), description=csql.get("description"))

                dataset_snapshot.aspects.append(dataset_properties)

                view_properties = ViewPropertiesClass(
                    materialized=False,
                    viewLanguage="SQL",
                    viewLogic=clean_query(csql.get("query", "")),
                )
                dataset_snapshot.aspects.append(view_properties)

                yield self.get_metadata_change_event(dataset_snapshot)
                yield self.get_metadata_change_proposal(
                    dataset_snapshot.urn,
                    aspect_name="subTypes",
                    aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
                )
Ejemplo n.º 8
0
    def emit_datasource(self,
                        datasource: dict,
                        workbook: dict = None) -> Iterable[MetadataWorkUnit]:
        datasource_info = workbook
        if workbook is None:
            datasource_info = datasource

        project = (datasource_info.get("projectName", "").replace(
            "/", REPLACE_SLASH_CHAR) if datasource_info else "")
        datasource_id = datasource.get("id", "")
        datasource_name = f"{datasource.get('name')}.{datasource_id}"
        datasource_urn = builder.make_dataset_urn(self.platform, datasource_id,
                                                  self.config.env)
        if datasource_id not in self.datasource_ids_being_used:
            self.datasource_ids_being_used.append(datasource_id)

        dataset_snapshot = DatasetSnapshot(
            urn=datasource_urn,
            aspects=[],
        )

        # Browse path
        browse_paths = BrowsePathsClass(paths=[
            f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource.get('name', '')}/{datasource_name}"
        ])
        dataset_snapshot.aspects.append(browse_paths)

        # Ownership
        owner = (self._get_ownership(
            datasource_info.get("owner", {}).get("username", ""))
                 if datasource_info else None)
        if owner is not None:
            dataset_snapshot.aspects.append(owner)

        # Dataset properties
        dataset_props = DatasetPropertiesClass(
            name=datasource.get("name"),
            description=datasource.get("description"),
            customProperties={
                "hasExtracts":
                str(datasource.get("hasExtracts", "")),
                "extractLastRefreshTime":
                datasource.get("extractLastRefreshTime", "") or "",
                "extractLastIncrementalUpdateTime":
                datasource.get("extractLastIncrementalUpdateTime", "") or "",
                "extractLastUpdateTime":
                datasource.get("extractLastUpdateTime", "") or "",
                "type":
                datasource.get("__typename", ""),
            },
        )
        dataset_snapshot.aspects.append(dataset_props)

        # Upstream Tables
        if datasource.get("upstreamTables") is not None:
            # datasource -> db table relations
            upstream_tables = self._create_upstream_table_lineage(
                datasource, project)

            if upstream_tables:
                upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
                yield self.get_metadata_change_proposal(
                    datasource_urn,
                    aspect_name="upstreamLineage",
                    aspect=upstream_lineage,
                )

        # Datasource Fields
        schema_metadata = self._get_schema_metadata_for_embedded_datasource(
            datasource.get("fields", []))
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        yield self.get_metadata_change_event(dataset_snapshot)
        yield self.get_metadata_change_proposal(
            dataset_snapshot.urn,
            aspect_name="subTypes",
            aspect=SubTypesClass(typeNames=["Data Source"]),
        )

        if datasource.get("__typename") == "EmbeddedDatasource":
            yield from add_entity_to_container(self.gen_workbook_key(workbook),
                                               "dataset", dataset_snapshot.urn)
Ejemplo n.º 9
0
    def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
        count_on_query = len(self.custom_sql_ids_being_used)
        custom_sql_filter = "idWithin: {}".format(
            json.dumps(self.custom_sql_ids_being_used)
        )
        custom_sql_connection, total_count, has_next_page = self.get_connection_object(
            custom_sql_graphql_query, "customSQLTablesConnection", custom_sql_filter
        )

        current_count = 0
        while has_next_page:
            count = (
                count_on_query
                if current_count + count_on_query < total_count
                else total_count - current_count
            )
            (
                custom_sql_connection,
                total_count,
                has_next_page,
            ) = self.get_connection_object(
                custom_sql_graphql_query,
                "customSQLTablesConnection",
                custom_sql_filter,
                count,
                current_count,
            )
            current_count += count

            unique_custom_sql = get_unique_custom_sql(
                custom_sql_connection.get("nodes", [])
            )
            for csql in unique_custom_sql:
                csql_id: str = csql["id"]
                csql_urn = builder.make_dataset_urn(
                    self.platform, csql_id, self.config.env
                )
                dataset_snapshot = DatasetSnapshot(
                    urn=csql_urn,
                    aspects=[],
                )

                datasource_name = None
                project = None
                if len(csql["datasources"]) > 0:
                    yield from self._create_lineage_from_csql_datasource(
                        csql_urn, csql["datasources"]
                    )

                    # CustomSQLTable id owned by exactly one tableau data source
                    logger.debug(
                        f"Number of datasources referencing CustomSQLTable: {len(csql['datasources'])}"
                    )

                    datasource = csql["datasources"][0]
                    datasource_name = datasource.get("name")
                    if datasource.get(
                        "__typename"
                    ) == "EmbeddedDatasource" and datasource.get("workbook"):
                        datasource_name = (
                            f"{datasource.get('workbook').get('name')}/{datasource_name}"
                            if datasource_name
                            and datasource.get("workbook").get("name")
                            else None
                        )
                        yield from add_entity_to_container(
                            self.gen_workbook_key(datasource["workbook"]),
                            "dataset",
                            dataset_snapshot.urn,
                        )
                    project = self._get_project(datasource)

                # lineage from custom sql -> datasets/tables #
                columns = csql.get("columns", [])
                yield from self._create_lineage_to_upstream_tables(csql_urn, columns)

                #  Schema Metadata
                schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
                if schema_metadata is not None:
                    dataset_snapshot.aspects.append(schema_metadata)

                # Browse path
                csql_name = csql.get("name") if csql.get("name") else csql_id

                if project and datasource_name:
                    browse_paths = BrowsePathsClass(
                        paths=[
                            f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource['name']}/{csql_name}"
                        ]
                    )
                    dataset_snapshot.aspects.append(browse_paths)
                else:
                    logger.debug(f"Browse path not set for Custom SQL table {csql_id}")

                dataset_properties = DatasetPropertiesClass(
                    name=csql.get("name"), description=csql.get("description")
                )

                dataset_snapshot.aspects.append(dataset_properties)

                view_properties = ViewPropertiesClass(
                    materialized=False,
                    viewLanguage="SQL",
                    viewLogic=clean_query(csql.get("query", "")),
                )
                dataset_snapshot.aspects.append(view_properties)

                yield self.get_metadata_change_event(dataset_snapshot)
                yield self.get_metadata_change_proposal(
                    dataset_snapshot.urn,
                    aspect_name="subTypes",
                    aspect=SubTypesClass(typeNames=["View", "Custom SQL"]),
                )
Ejemplo n.º 10
0
    def _extract_record(
            self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
        logger.debug(f"topic = {topic}")

        # 1. Create the default dataset snapshot for the topic.
        dataset_name = topic
        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=dataset_name,
            platform_instance=self.source_config.platform_instance,
            env=self.source_config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[Status(removed=False)],  # we append to this list later on
        )

        # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
        schema_metadata = self.schema_registry_client.get_schema_metadata(
            topic, platform_urn)
        if schema_metadata is not None:
            dataset_snapshot.aspects.append(schema_metadata)

        # 3. Attach browsePaths aspect
        browse_path_suffix = (f"{self.source_config.platform_instance}/{topic}"
                              if self.source_config.platform_instance else
                              topic)
        browse_path = BrowsePathsClass([
            f"/{self.source_config.env.lower()}/{self.platform}/{browse_path_suffix}"
        ])
        dataset_snapshot.aspects.append(browse_path)

        # 4. Attach dataPlatformInstance aspect.
        if self.source_config.platform_instance:
            dataset_snapshot.aspects.append(
                DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=make_dataplatform_instance_urn(
                        self.platform, self.source_config.platform_instance),
                ))

        # 5. Emit the datasetSnapshot MCE
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = MetadataWorkUnit(id=f"kafka-{topic}", mce=mce)
        self.report.report_workunit(wu)
        yield wu

        # 5. Add the subtype aspect marking this as a "topic"
        subtype_wu = MetadataWorkUnit(
            id=f"{topic}-subtype",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["topic"]),
            ),
        )
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        domain_urn: Optional[str] = None

        # 6. Emit domains aspect MCPW
        for domain, pattern in self.source_config.domain.items():
            if pattern.allowed(dataset_name):
                domain_urn = make_domain_urn(domain)

        if domain_urn:
            wus = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=domain_urn,
            )
            for wu in wus:
                self.report.report_workunit(wu)
                yield wu
Ejemplo n.º 11
0
    def _process_table(
        self,
        dataset_name: str,
        inspector: Inspector,
        schema: str,
        table: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
        columns = self._get_columns(dataset_name, inspector, schema, table)
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[StatusClass(removed=False)],
        )
        if self.is_stateful_ingestion_configured():
            cur_checkpoint = self.get_current_checkpoint(
                self.get_default_ingestion_job_id())
            if cur_checkpoint is not None:
                checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                        cur_checkpoint.state)
                checkpoint_state.add_table_urn(dataset_urn)

        description, properties, location_urn = self.get_table_properties(
            inspector, schema, table)
        dataset_properties = DatasetPropertiesClass(
            name=table,
            description=description,
            customProperties=properties,
        )
        dataset_snapshot.aspects.append(dataset_properties)

        if location_urn:
            external_upstream_table = UpstreamClass(
                dataset=location_urn,
                type=DatasetLineageTypeClass.COPY,
            )
            lineage_mcpw = MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_snapshot.urn,
                aspectName="upstreamLineage",
                aspect=UpstreamLineage(upstreams=[external_upstream_table]),
            )
            lineage_wu = MetadataWorkUnit(
                id=
                f"{self.platform}-{lineage_mcpw.entityUrn}-{lineage_mcpw.aspectName}",
                mcp=lineage_mcpw,
            )
            yield lineage_wu

        pk_constraints: dict = inspector.get_pk_constraint(table, schema)
        foreign_keys = self._get_foreign_keys(dataset_urn, inspector, schema,
                                              table)
        schema_fields = self.get_schema_fields(dataset_name, columns,
                                               pk_constraints)
        schema_metadata = get_schema_metadata(
            self.report,
            dataset_name,
            self.platform,
            columns,
            pk_constraints,
            foreign_keys,
            schema_fields,
        )
        dataset_snapshot.aspects.append(schema_metadata)
        db_name = self.get_db_name(inspector)
        yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                      schema)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        dpi_aspect = self.get_dataplatform_instance_aspect(
            dataset_urn=dataset_urn)
        if dpi_aspect:
            yield dpi_aspect
        subtypes_aspect = MetadataWorkUnit(
            id=f"{dataset_name}-subtypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["table"]),
            ),
        )
        yield subtypes_aspect

        yield from self._get_domain_wu(
            dataset_name=dataset_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
            sql_config=sql_config,
        )
Ejemplo n.º 12
0
    def _process_view(
        self,
        dataset_name: str,
        inspector: Inspector,
        schema: str,
        view: str,
        sql_config: SQLAlchemyConfig,
    ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]:
        try:
            columns = inspector.get_columns(view, schema)
        except KeyError:
            # For certain types of views, we are unable to fetch the list of columns.
            self.report.report_warning(dataset_name,
                                       "unable to get schema for this view")
            schema_metadata = None
        else:
            schema_fields = self.get_schema_fields(dataset_name, columns)
            schema_metadata = get_schema_metadata(
                self.report,
                dataset_name,
                self.platform,
                columns,
                canonical_schema=schema_fields,
            )
        try:
            # SQLALchemy stubs are incomplete and missing this method.
            # PR: https://github.com/dropbox/sqlalchemy-stubs/pull/223.
            view_info: dict = inspector.get_table_comment(
                view, schema)  # type: ignore
        except NotImplementedError:
            description: Optional[str] = None
            properties: Dict[str, str] = {}
        else:
            description = view_info["text"]

            # The "properties" field is a non-standard addition to SQLAlchemy's interface.
            properties = view_info.get("properties", {})
        try:
            view_definition = inspector.get_view_definition(view, schema)
            if view_definition is None:
                view_definition = ""
            else:
                # Some dialects return a TextClause instead of a raw string,
                # so we need to convert them to a string.
                view_definition = str(view_definition)
        except NotImplementedError:
            view_definition = ""
        properties["view_definition"] = view_definition
        properties["is_view"] = "True"
        dataset_urn = make_dataset_urn_with_platform_instance(
            self.platform,
            dataset_name,
            self.config.platform_instance,
            self.config.env,
        )
        dataset_snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[StatusClass(removed=False)],
        )
        db_name = self.get_db_name(inspector)
        yield from self.add_table_to_schema_container(dataset_urn, db_name,
                                                      schema)
        if self.is_stateful_ingestion_configured():
            cur_checkpoint = self.get_current_checkpoint(
                self.get_default_ingestion_job_id())
            if cur_checkpoint is not None:
                checkpoint_state = cast(BaseSQLAlchemyCheckpointState,
                                        cur_checkpoint.state)
                checkpoint_state.add_view_urn(dataset_urn)
        dataset_properties = DatasetPropertiesClass(
            name=view,
            description=description,
            customProperties=properties,
        )
        dataset_snapshot.aspects.append(dataset_properties)
        if schema_metadata:
            dataset_snapshot.aspects.append(schema_metadata)
        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        wu = SqlWorkUnit(id=dataset_name, mce=mce)
        self.report.report_workunit(wu)
        yield wu
        dpi_aspect = self.get_dataplatform_instance_aspect(
            dataset_urn=dataset_urn)
        if dpi_aspect:
            yield dpi_aspect
        subtypes_aspect = MetadataWorkUnit(
            id=f"{view}-subtypes",
            mcp=MetadataChangeProposalWrapper(
                entityType="dataset",
                changeType=ChangeTypeClass.UPSERT,
                entityUrn=dataset_urn,
                aspectName="subTypes",
                aspect=SubTypesClass(typeNames=["view"]),
            ),
        )
        yield subtypes_aspect
        if "view_definition" in properties:
            view_definition_string = properties["view_definition"]
            view_properties_aspect = ViewPropertiesClass(
                materialized=False,
                viewLanguage="SQL",
                viewLogic=view_definition_string)
            yield MetadataWorkUnit(
                id=f"{view}-viewProperties",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="viewProperties",
                    aspect=view_properties_aspect,
                ),
            )

        yield from self._get_domain_wu(
            dataset_name=dataset_name,
            entity_urn=dataset_urn,
            entity_type="dataset",
            sql_config=sql_config,
        )
Ejemplo n.º 13
0
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
    def strip_types(field_path: str) -> str:

        final_path = field_path
        final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
        final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
        return final_path

    datasets: List[DatasetSnapshotClass] = []

    for entity_name, entity_def in entity_registry.items():
        entity_display_name = entity_def.display_name
        entity_fields = []
        for aspect_name in entity_def.aspects:
            if aspect_name not in aspect_registry:
                print(
                    f"Did not find aspect name: {aspect_name} in aspect_registry"
                )
                continue

            # all aspects should have a schema
            aspect_schema = aspect_registry[aspect_name].schema
            assert aspect_schema
            entity_fields.append({
                "type": aspect_schema.to_json(),
                "name": aspect_name,
            })

        if entity_fields:
            names = avro.schema.Names()
            field_objects = []
            for f in entity_fields:
                field = avro.schema.Field(
                    type=f["type"],
                    name=f["name"],
                    has_default=False,
                )
                field_objects.append(field)

            with unittest.mock.patch("avro.schema.Names.add_name", add_name):
                entity_avro_schema = avro.schema.RecordSchema(
                    name=entity_name,
                    namespace="datahub.metadata.model",
                    names=names,
                    fields=[],
                )
                entity_avro_schema.set_prop("fields", field_objects)
            rawSchema = json.dumps(entity_avro_schema.to_json())
            # always add the URN which is the primary key
            urn_field = SchemaField(
                fieldPath="urn",
                type=SchemaFieldDataTypeClass(type=StringTypeClass()),
                nativeDataType="string",
                nullable=False,
                isPartOfKey=True,
                description=
                f"The primary identifier for the {entity_name} entity. See the {entity_def.keyAspect} field to understand the structure of this urn.",
            )
            schema_fields: List[SchemaField] = [
                urn_field
            ] + avro_schema_to_mce_fields(rawSchema)
            foreign_keys: List[ForeignKeyConstraintClass] = []
            source_dataset_urn = make_dataset_urn(
                platform=make_data_platform_urn("datahub"),
                name=f"{entity_display_name}",
            )
            for f_field in schema_fields:
                if f_field.jsonProps:
                    json_dict = json.loads(f_field.jsonProps)
                    if "Aspect" in json_dict:
                        aspect_info = json_dict["Aspect"]
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Aspect"))
                        # if this is the key aspect, also add primary-key
                        if entity_def.keyAspect == aspect_info.get("name"):
                            f_field.isPartOfKey = True

                        if "timeseries" == aspect_info.get("type", ""):
                            # f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            #    tags=[]
                            # )
                            f_field.globalTags.tags.append(
                                TagAssociationClass(tag="urn:li:tag:Temporal"))
                        import pdb

                        # breakpoint()
                    if "Searchable" in json_dict:
                        f_field.globalTags = f_field.globalTags or GlobalTagsClass(
                            tags=[])
                        f_field.globalTags.tags.append(
                            TagAssociationClass(tag="urn:li:tag:Searchable"))
                    if "Relationship" in json_dict:
                        relationship_info = json_dict["Relationship"]
                        # detect if we have relationship specified at leaf level or thru path specs
                        if "entityTypes" not in relationship_info:
                            # path spec
                            assert (
                                len(relationship_info.keys()) == 1
                            ), "We should never have more than one path spec assigned to a relationship annotation"
                            final_info = None
                            for k, v in relationship_info.items():
                                final_info = v
                            relationship_info = final_info

                        assert "entityTypes" in relationship_info

                        entity_types: List[str] = relationship_info.get(
                            "entityTypes", [])
                        relnship_name = relationship_info.get("name", None)
                        for entity_type in entity_types:
                            destination_entity_name = capitalize_first(
                                entity_type)

                            foreign_dataset_urn = make_dataset_urn(
                                platform=make_data_platform_urn("datahub"),
                                name=destination_entity_name,
                            )
                            fkey = ForeignKeyConstraintClass(
                                name=relnship_name,
                                foreignDataset=foreign_dataset_urn,
                                foreignFields=[
                                    f"urn:li:schemaField:({foreign_dataset_urn}, urn)"
                                ],
                                sourceFields=[
                                    f"urn:li:schemaField:({source_dataset_urn},{f_field.fieldPath})"
                                ],
                            )
                            foreign_keys.append(fkey)
                            relnships_graph.add_edge(
                                entity_display_name,
                                destination_entity_name,
                                fkey.name,
                                f" via `{strip_types(f_field.fieldPath)}`",
                                edge_id=
                                f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
                            )

            schemaMetadata = SchemaMetadataClass(
                schemaName=f"{entity_name}",
                platform=make_data_platform_urn("datahub"),
                platformSchema=OtherSchemaClass(rawSchema=rawSchema),
                fields=schema_fields,
                version=0,
                hash="",
                foreignKeys=foreign_keys if foreign_keys else None,
            )

            dataset = DatasetSnapshotClass(
                urn=make_dataset_urn(
                    platform=make_data_platform_urn("datahub"),
                    name=f"{entity_display_name}",
                ),
                aspects=[
                    schemaMetadata,
                    GlobalTagsClass(
                        tags=[TagAssociationClass(tag="urn:li:tag:Entity")]),
                    BrowsePathsClass(
                        [f"/prod/datahub/entities/{entity_display_name}"]),
                ],
            )
            datasets.append(dataset)

    events: List[Union[MetadataChangeEventClass,
                       MetadataChangeProposalWrapper]] = []

    for d in datasets:
        entity_name = d.urn.split(":")[-1].split(",")[1]
        d.aspects.append(
            DatasetPropertiesClass(
                description=make_entity_docs(entity_name, relnships_graph)))

        mce = MetadataChangeEventClass(proposedSnapshot=d, )
        events.append(mce)

        mcp = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=d.urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["entity"]),
        )
        events.append(mcp)
    return events