Example #1
    def _get_schema_metadata(
        self, pulsar_topic: PulsarTopic, platform_urn: str
    ) -> Tuple[Optional[PulsarSchema], Optional[SchemaMetadata]]:

        schema, fields = self._get_schema_and_fields(
            pulsar_topic=pulsar_topic, is_key_schema=False
        )  # type: Tuple[Optional[PulsarSchema], List[SchemaField]]

        # Create the schemaMetadata aspect.
        if schema is not None:
            md5_hash = md5(schema.schema_str.encode()).hexdigest()

            return schema, SchemaMetadata(
                schemaName=schema.schema_name,
                version=schema.schema_version,
                hash=md5_hash,
                platform=platform_urn,
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str,
                    keySchema=None,
                ),
                fields=fields,
            )
        return None, None

    def get_schema_metadata(self, topic: str,
                            platform_urn: str) -> Optional[SchemaMetadata]:
        logger.debug(f"Inside _get_schema_metadata {topic} {platform_urn}")
        # Process the value schema
        schema, fields = self._get_schema_and_fields(
            topic=topic, is_key_schema=False
        )  # type: Tuple[Optional[Schema], List[SchemaField]]

        # Process the key schema
        key_schema, key_fields = self._get_schema_and_fields(
            topic=topic, is_key_schema=True
        )  # type: Tuple[Optional[Schema], List[SchemaField]]

        # Create the schemaMetadata aspect.
        if schema is not None or key_schema is not None:
            # Create a merged string for the combined schemas and compute an md5 hash across it.
            schema_as_string = (schema.schema_str if schema is not None else
                                "") + (key_schema.schema_str
                                       if key_schema is not None else "")
            md5_hash = md5(schema_as_string.encode()).hexdigest()

            return SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=md5_hash,
                platform=platform_urn,
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str
                    if schema is not None else "",
                    keySchema=key_schema.schema_str if key_schema else None,
                ),
                fields=key_fields + fields,
            )
        return None
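
Both functions in Example #1 fingerprint the registered schema by MD5-hashing the raw schema string; the second one concatenates the value and key schema strings first, treating a missing schema as the empty string. A minimal, self-contained sketch of that hashing step, using hypothetical schema strings rather than anything taken from the examples:

from hashlib import md5
from typing import Optional


def compute_schema_hash(value_schema_str: Optional[str],
                        key_schema_str: Optional[str]) -> str:
    # Concatenate value schema first, then key schema, treating a missing
    # schema as the empty string, mirroring the examples above.
    schema_as_string = (value_schema_str if value_schema_str is not None else
                        "") + (key_schema_str
                               if key_schema_str is not None else "")
    return md5(schema_as_string.encode()).hexdigest()


# Hypothetical inputs, for illustration only.
print(compute_schema_hash('{"type": "record", "name": "Value", "fields": []}', None))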
Example #3
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic
        actor = "urn:li:corpuser:etl"
        sys_time = get_sys_time()

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))

        # Fetch schema from the registry.
        has_schema = True
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get schema: {e}")
            has_schema = False

        # Parse the schema
        fields: List[SchemaField] = []
        if has_schema and schema.schema_type == "AVRO":
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif has_schema:
            self.report.report_warning(
                topic,
                f"unable to parse kafka schema type {schema.schema_type}")

        if has_schema:
            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=str(schema._hash),
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(documentSchema=schema.schema_str),
                fields=fields,
                created=AuditStamp(time=sys_time, actor=actor),
                lastModified=AuditStamp(time=sys_time, actor=actor),
            )
            dataset_snapshot.aspects.append(schema_metadata)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
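
Examples #3 and #4 assemble the dataset URN directly from an f-string over the platform, topic name, and configured environment. A small standalone sketch of that construction; the topic name and the "PROD" environment are assumed values for illustration, whereas the source reads the topic being processed and self.source_config.env:

platform = "kafka"
topic = "payments.transactions"  # hypothetical topic name
env = "PROD"                     # assumed; the source uses self.source_config.env

dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{topic},{env})"
print(dataset_urn)
# urn:li:dataset:(urn:li:dataPlatform:kafka,payments.transactions,PROD)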
Example #4
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic

        dataset_snapshot = DatasetSnapshot(
            urn=
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        # Fetch schema from the registry.
        schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic,
                                       f"failed to get value schema: {e}")

        # Parse the schema
        fields: List[SchemaField] = []
        if schema and schema.schema_type == "AVRO":
            # "value.id" or "value.[type=string]id"
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
            )
        # Fetch key schema from the registry
        key_schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-key")
            key_schema = registered_schema.schema
        except Exception as e:
            # do not report warnings because it is okay to not have key schemas
            logger.debug(f"{topic}: no key schema found. {e}")

        # Parse the key schema
        key_fields: List[SchemaField] = []
        if key_schema and key_schema.schema_type == "AVRO":
            key_fields = schema_util.avro_schema_to_mce_fields(
                key_schema.schema_str, is_key_schema=True)
        elif key_schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
            )

        key_schema_str: Optional[str] = None
        if schema is not None or key_schema is not None:
            # Create a merged string for the combined schemas and compute an md5 hash across it.
            schema_as_string = (schema.schema_str if schema is not None else
                                "") + (key_schema.schema_str
                                       if key_schema is not None else "")
            md5_hash = md5(schema_as_string.encode()).hexdigest()

            if key_schema:
                key_schema_str = key_schema.schema_str

            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=md5_hash,
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str
                    if schema is not None else "",
                    keySchema=key_schema_str,
                ),
                fields=key_fields + fields,
            )
            dataset_snapshot.aspects.append(schema_metadata)

        browse_path = BrowsePathsClass(
            [f"/{self.source_config.env.lower()}/{platform}/{topic}"])
        dataset_snapshot.aspects.append(browse_path)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record
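
One detail worth noting about the merged-schema string in Example #4: in Python, a + b if cond else c parses as (a + b) if cond else c, so each optional part has to be parenthesized separately or the value schema would be dropped whenever no key schema is registered. A quick standalone comparison of the two forms, using hypothetical schema strings:

value_str = '{"type": "string"}'  # hypothetical value schema
key_str = None                    # no key schema registered for this topic

# Unparenthesized: the whole expression collapses to "" when key_str is None,
# silently discarding the value schema from the hash input.
wrong = value_str + key_str if key_str is not None else ""

# Parenthesized: each optional part falls back to "" independently.
right = (value_str if value_str is not None else "") + (
    key_str if key_str is not None else "")

print(repr(wrong))  # ''
print(repr(right))  # '{"type": "string"}'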