コード例 #1
0
ファイル: sql_common.py プロジェクト: swaroopjagadish/datahub
    def _gen_domain_urn(self, dataset_name: str) -> Optional[str]:
        """Return the domain urn for ``dataset_name``, or ``None`` if no domain matches.

        Every configured domain is checked; when several patterns allow the
        dataset, the one that comes last in ``self.config.domain`` wins.
        """
        resolved_urn: Optional[str] = None

        for domain_key, allow_deny in self.config.domain.items():
            if not allow_deny.allowed(dataset_name):
                continue
            # Keep overwriting so that the last matching domain is the result.
            resolved_urn = make_domain_urn(domain_key)

        return resolved_urn
コード例 #2
0
    def _extract_record(
        self, topic: str, partitioned: bool
    ) -> Iterable[MetadataWorkUnit]:
        """Yield the metadata work units that describe a single Pulsar topic.

        Each work unit is reported to ``self.report`` right before it is
        yielded. The order is: status, schemaMetadata (only when schema
        metadata is available), datasetProperties (only when a schema object is
        available), browsePaths, dataPlatformInstance (only when a platform
        instance is configured), subTypes, and finally the work units that
        attach the matching domain, if any.

        Args:
            topic: Full Pulsar topic name, i.e. persistent://tenant/namespace/topic.
            partitioned: Whether the topic is partitioned; stored as the
                lower-cased string "true"/"false" in the custom properties.
        """

        def reported(workunit: MetadataWorkUnit) -> MetadataWorkUnit:
            # Register the work unit with the report and hand it back so it
            # can be yielded in the same statement.
            self.report.report_workunit(workunit)
            return workunit

        logger.info(f"topic = {topic}")

        # The parsed topic exposes the tenant, namespace, topic and full name.
        parsed_topic = PulsarTopic(topic)

        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=parsed_topic.fullname,
            platform_instance=self.config.platform_instance,
            env=self.config.env,
        )

        # Step 1: status aspect, marking the dataset as not removed.
        yield reported(
            MetadataWorkUnit(
                id=f"{dataset_urn}-status",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="status",
                    aspect=StatusClass(removed=False),
                ),
            )
        )

        # Step 2: schemaMetadata aspect, skipped when no schema metadata exists.
        schema, schema_metadata = self._get_schema_metadata(
            parsed_topic, platform_urn
        )
        if schema_metadata is not None:
            yield reported(
                MetadataWorkUnit(
                    id=f"{dataset_urn}-schemaMetadata",
                    mcp=MetadataChangeProposalWrapper(
                        entityType="dataset",
                        changeType=ChangeTypeClass.UPSERT,
                        entityUrn=dataset_urn,
                        aspectName="schemaMetadata",
                        aspect=schema_metadata,
                    ),
                )
            )

        # TODO Add topic properties (Pulsar 2.10.0 feature)
        # Step 3: datasetProperties aspect, built from the schema's own
        # properties plus a few static entries merged into them in place.
        if schema is not None:
            schema.properties.update(
                {
                    "schema_version": str(schema.schema_version),
                    "schema_type": schema.schema_type,
                    "partitioned": str(partitioned).lower(),
                }
            )
            properties_aspect = DatasetPropertiesClass(
                description=schema.schema_description,
                customProperties=schema.properties,
            )
            yield reported(
                MetadataWorkUnit(
                    id=f"{dataset_urn}-datasetProperties",
                    mcp=MetadataChangeProposalWrapper(
                        entityType="dataset",
                        changeType=ChangeTypeClass.UPSERT,
                        entityUrn=dataset_urn,
                        aspectName="datasetProperties",
                        aspect=properties_aspect,
                    ),
                )
            )

        # Step 4: browsePaths aspect. The path is
        # /<env>/<platform>/[<platform_instance>/]<tenant>/<namespace>/<topic>.
        browse_path_suffix = (
            f"{parsed_topic.tenant}/{parsed_topic.namespace}/{parsed_topic.topic}"
        )
        if self.config.platform_instance:
            browse_path_suffix = (
                f"{self.config.platform_instance}/{browse_path_suffix}"
            )
        yield reported(
            MetadataWorkUnit(
                id=f"{dataset_urn}-browsePaths",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="browsePaths",
                    aspect=BrowsePathsClass(
                        [
                            f"/{self.config.env.lower()}/{self.platform}/{browse_path_suffix}"
                        ]
                    ),
                ),
            )
        )

        # Step 5: dataPlatformInstance aspect, only with a configured instance.
        if self.config.platform_instance:
            instance_aspect = DataPlatformInstanceClass(
                platform=platform_urn,
                instance=make_dataplatform_instance_urn(
                    self.platform, self.config.platform_instance
                ),
            )
            yield reported(
                MetadataWorkUnit(
                    id=f"{dataset_urn}-dataPlatformInstance",
                    mcp=MetadataChangeProposalWrapper(
                        entityType="dataset",
                        changeType=ChangeTypeClass.UPSERT,
                        entityUrn=dataset_urn,
                        aspectName="dataPlatformInstance",
                        aspect=instance_aspect,
                    ),
                )
            )

        # Step 6: subTypes aspect marking the dataset as a "topic".
        yield reported(
            MetadataWorkUnit(
                id=f"{dataset_urn}-subTypes",
                mcp=MetadataChangeProposalWrapper(
                    entityType="dataset",
                    changeType=ChangeTypeClass.UPSERT,
                    entityUrn=dataset_urn,
                    aspectName="subTypes",
                    aspect=SubTypesClass(typeNames=["topic"]),
                ),
            )
        )

        # Step 7: domain work units. All configured domains are checked and the
        # last one whose pattern allows the full topic name is used.
        matched_domain_urn: Optional[str] = None
        for domain_key, allow_deny in self.config.domain.items():
            if not allow_deny.allowed(parsed_topic.fullname):
                continue
            matched_domain_urn = make_domain_urn(domain_key)

        if matched_domain_urn:
            domain_workunits = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=matched_domain_urn,
            )
            for domain_wu in domain_workunits:
                yield reported(domain_wu)
コード例 #3
0
    def _gen_domain_urn(self, dataset_name: str) -> Optional[str]:
        """Return the urn of the first configured domain that allows ``dataset_name``.

        Domains are checked in the iteration order of
        ``self.source_config.domain``; ``None`` is returned when none matches.
        """
        # The generator is lazy, so checking stops at the first allowed pattern.
        matching_urns = (
            make_domain_urn(domain_key)
            for domain_key, allow_deny in self.source_config.domain.items()
            if allow_deny.allowed(dataset_name)
        )
        return next(matching_urns, None)
コード例 #4
0
ファイル: kafka.py プロジェクト: hsheth2/datahub
    def _extract_record(self, topic: str) -> Iterable[MetadataWorkUnit]:  # noqa: C901
        """Yield the metadata work units that describe a single Kafka topic.

        Three groups of work units are produced, each reported to
        ``self.report`` right before being yielded: the dataset snapshot MCE
        (status, optional schemaMetadata, browsePaths and optional
        dataPlatformInstance aspects), the subTypes proposal, and the work
        units that attach the matching domain, if any.

        Args:
            topic: Kafka topic name; it is also used as the dataset name.
        """
        logger.debug(f"topic = {topic}")

        source_config = self.source_config

        # 1. Create the default dataset snapshot for the topic.
        platform_urn = make_data_platform_urn(self.platform)
        dataset_urn = make_dataset_urn_with_platform_instance(
            platform=self.platform,
            name=topic,
            platform_instance=source_config.platform_instance,
            env=source_config.env,
        )
        # Further aspects are appended to the snapshot in the steps below.
        snapshot = DatasetSnapshot(
            urn=dataset_urn,
            aspects=[Status(removed=False)],
        )

        # 2. Attach schemaMetadata aspect (pass control to SchemaRegistry)
        schema_aspect = self.schema_registry_client.get_schema_metadata(
            topic, platform_urn
        )
        if schema_aspect is not None:
            snapshot.aspects.append(schema_aspect)

        # 3. Attach browsePaths aspect:
        #    /<env>/<platform>/[<platform_instance>/]<topic>
        if source_config.platform_instance:
            path_tail = f"{source_config.platform_instance}/{topic}"
        else:
            path_tail = topic
        snapshot.aspects.append(
            BrowsePathsClass(
                [f"/{source_config.env.lower()}/{self.platform}/{path_tail}"]
            )
        )

        # 4. Attach dataPlatformInstance aspect.
        if source_config.platform_instance:
            instance_urn = make_dataplatform_instance_urn(
                self.platform, source_config.platform_instance
            )
            snapshot.aspects.append(
                DataPlatformInstanceClass(
                    platform=platform_urn,
                    instance=instance_urn,
                )
            )

        # 5. Emit the datasetSnapshot MCE
        snapshot_wu = MetadataWorkUnit(
            id=f"kafka-{topic}",
            mce=MetadataChangeEvent(proposedSnapshot=snapshot),
        )
        self.report.report_workunit(snapshot_wu)
        yield snapshot_wu

        # 6. Add the subtype aspect marking this as a "topic"
        subtype_proposal = MetadataChangeProposalWrapper(
            entityType="dataset",
            changeType=ChangeTypeClass.UPSERT,
            entityUrn=dataset_urn,
            aspectName="subTypes",
            aspect=SubTypesClass(typeNames=["topic"]),
        )
        subtype_wu = MetadataWorkUnit(id=f"{topic}-subtype", mcp=subtype_proposal)
        self.report.report_workunit(subtype_wu)
        yield subtype_wu

        # 7. Emit domains aspect MCPW. All configured domains are checked and
        #    the last one whose pattern allows the topic is used.
        matched_domain_urn: Optional[str] = None
        for domain_key, allow_deny in source_config.domain.items():
            if not allow_deny.allowed(topic):
                continue
            matched_domain_urn = make_domain_urn(domain_key)

        if matched_domain_urn:
            domain_workunits = add_domain_to_entity_wu(
                entity_type="dataset",
                entity_urn=dataset_urn,
                domain_urn=matched_domain_urn,
            )
            for domain_wu in domain_workunits:
                self.report.report_workunit(domain_wu)
                yield domain_wu