Example #1
    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.consumer = confluent_kafka.Consumer({
            "group.id": "test",
            "bootstrap.servers": self.source_config.connection.bootstrap,
            **self.source_config.connection.consumer_config,
        })
        self.schema_registry_client = SchemaRegistryClient(
            {"url": self.source_config.connection.schema_registry_url}
        )
        self.report = KafkaSourceReport()
Example #2
    def schema_registry(self, conf=None):
        if not hasattr(self._cluster, 'sr'):
            return None

        sr_conf = {'url': self._cluster.sr.get('url')}
        if conf is not None:
            sr_conf.update(conf)
        return SchemaRegistryClient(sr_conf)
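Both snippets above build a SchemaRegistryClient from a plain configuration dict containing the registry URL. As a minimal, self-contained sketch of that pattern (the registry URL and subject name below are placeholder assumptions, not values taken from the examples), the client can then fetch the latest schema registered for a topic's value subject:

from confluent_kafka.schema_registry import SchemaRegistryClient

# Placeholder registry URL.
sr_client = SchemaRegistryClient({"url": "http://localhost:8081"})

# Under the default TopicNameStrategy, a topic's value schema is registered
# beneath the "<topic>-value" subject.
registered = sr_client.get_latest_version("my-topic-value")
print(registered.schema.schema_type)  # e.g. "AVRO" or "JSON"
print(registered.schema.schema_str)   # the raw schema document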
Example #3
class KafkaSource(Source):
    source_config: KafkaSourceConfig
    consumer: confluent_kafka.Consumer
    report: KafkaSourceReport

    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.consumer = confluent_kafka.Consumer({
            "group.id": "test",
            "bootstrap.servers": self.source_config.connection.bootstrap,
            **self.source_config.connection.consumer_config,
        })
        self.schema_registry_client = SchemaRegistryClient({
            "url": self.source_config.connection.schema_registry_url,
            **self.source_config.connection.schema_registry_config,
        })
        self.report = KafkaSourceReport()

    @classmethod
    def create(cls, config_dict, ctx):
        config = KafkaSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        topics = self.consumer.list_topics().topics
        for t in topics:
            self.report.report_topic_scanned(t)

            if self.source_config.topic_patterns.allowed(t):
                mce = self._extract_record(t)
                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(t)

    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        # Fetch schema from the registry.
        schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic,
                                       f"failed to get value schema: {e}")

        # Parse the schema
        fields: List[SchemaField] = []
        if schema and schema.schema_type == "AVRO":
            # "value.id" or "value.[type=string]id"
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
            )
        # Fetch key schema from the registry
        key_schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-key")
            key_schema = registered_schema.schema
        except Exception as e:
            # do not report warnings because it is okay to not have key schemas
            logger.debug(f"{topic}: no key schema found. {e}")
            pass

        # Parse the key schema
        key_fields: List[SchemaField] = []
        if key_schema and key_schema.schema_type == "AVRO":
            key_fields = schema_util.avro_schema_to_mce_fields(
                key_schema.schema_str, is_key_schema=True)
        elif key_schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
            )

        key_schema_str: Optional[str] = None
        if schema is not None or key_schema is not None:
            # Create a merged string for the combined schemas and compute an md5 hash across it.
            schema_as_string = schema.schema_str if schema is not None else ""
            if key_schema is not None:
                schema_as_string += key_schema.schema_str
            md5_hash = md5(schema_as_string.encode()).hexdigest()

            if key_schema:
                key_schema_str = key_schema.schema_str

            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=md5_hash,
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str
                    if schema is not None else "",
                    keySchema=key_schema_str,
                ),
                fields=key_fields + fields,
            )
            dataset_snapshot.aspects.append(schema_metadata)

        browse_path = BrowsePathsClass(
            [f"/{self.source_config.env.lower()}/{platform}/{topic}"])
        dataset_snapshot.aspects.append(browse_path)

        metadata_record = MetadataChangeEvent(
            proposedSnapshot=dataset_snapshot)
        return metadata_record

    def get_report(self):
        return self.report

    def close(self):
        if self.consumer:
            self.consumer.close()
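The get_workunits() loop above drives everything from Consumer.list_topics(), whose result exposes a .topics mapping of topic names to metadata. A standalone sketch of that listing step, with a placeholder broker address and group id:

import confluent_kafka

# Placeholder connection settings.
consumer = confluent_kafka.Consumer({
    "group.id": "schema-scan",
    "bootstrap.servers": "localhost:9092",
})

# list_topics() returns cluster metadata; .topics is a dict keyed by topic name.
for topic_name in consumer.list_topics(timeout=10).topics:
    print(topic_name)

consumer.close()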
Example #4
class KafkaSource(Source):
    source_config: KafkaSourceConfig
    consumer: confluent_kafka.Consumer
    report: KafkaSourceReport

    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.consumer = confluent_kafka.Consumer(
            {
                "group.id": "test",
                "bootstrap.servers": self.source_config.connection.bootstrap,
                **self.source_config.connection.consumer_config,
            }
        )
        self.schema_registry_client = SchemaRegistryClient(
            {"url": self.source_config.connection.schema_registry_url}
        )
        self.report = KafkaSourceReport()

    @classmethod
    def create(cls, config_dict, ctx):
        config = KafkaSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        topics = self.consumer.list_topics().topics
        for t in topics:
            self.report.report_topic_scanned(t)

            if self.source_config.topic_patterns.allowed(t):
                mce = self._extract_record(t)
                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(t)

    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic
        env = "PROD"  # TODO: configure!
        actor, sys_time = "urn:li:corpuser:etl", int(time.time() * 1000)

        metadata_record = MetadataChangeEvent()
        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        metadata_record.proposedSnapshot = dataset_snapshot

        # Fetch schema from the registry.
        has_schema = True
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value"
            )
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get schema: {e}")
            has_schema = False

        # Parse the schema
        fields: List[SchemaField] = []
        if has_schema and schema.schema_type == "AVRO":
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif has_schema:
            self.report.report_warning(
                topic, f"unable to parse kafka schema type {schema.schema_type}"
            )

        if has_schema:
            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=str(schema._hash),
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(documentSchema=schema.schema_str),
                fields=fields,
                created=AuditStamp(time=sys_time, actor=actor),
                lastModified=AuditStamp(time=sys_time, actor=actor),
            )
            dataset_snapshot.aspects.append(schema_metadata)

        return metadata_record

    def get_report(self):
        return self.report

    def close(self):
        if self.consumer:
            self.consumer.close()
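Both versions of _extract_record look up schemas by the TopicNameStrategy subject names "<topic>-value" and "<topic>-key", and treat a missing key schema as non-fatal. A condensed sketch of that lookup pattern; the registry URL and topic are placeholders, and the fetch_latest_schema helper is illustrative rather than part of either example:

from confluent_kafka.schema_registry import SchemaRegistryClient

sr_client = SchemaRegistryClient({"url": "http://localhost:8081"})  # placeholder URL
topic = "my-topic"  # placeholder topic


def fetch_latest_schema(client, subject):
    # Return the latest schema registered under `subject`, or None if the
    # lookup fails (e.g. no schema registered for that subject).
    try:
        return client.get_latest_version(subject).schema
    except Exception:
        return None


value_schema = fetch_latest_schema(sr_client, f"{topic}-value")  # normally expected
key_schema = fetch_latest_schema(sr_client, f"{topic}-key")      # frequently absent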
Example #5
    def _create_schema_registry_client(self):
        client = None
        if self.__schema_registry_conf is not None:
            client = SchemaRegistryClient(self.__schema_registry_conf)
        return client
    "type": "object",
    "title": s.registrations_topic + "-value",
    "properties": {
        "address": {
            "type": "string"
        },
        "keyword": {
            "type": "string"
        },
        "position": {
            "type": "integer"
        },
    },
}

schema_client = SchemaRegistryClient({"url": s.schema_server})

json_serializer = JSONSerializer(dumps(message_schema),
                                 schema_client,
                                 conf={"auto.register.schemas": False})
string_serializer = StringSerializer()

key_context = SerializationContext(s.registrations_topic, MessageField.KEY)
value_context = SerializationContext(s.registrations_topic, MessageField.VALUE)

con = psycopg2.connect(
    database=s.db_database,
    user=s.db_user,
    password=s.db_password,
    host=s.db_server,
    port=s.db_port,
)
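The serializers configured above would typically be applied when producing to the registrations topic. A sketch of that step, reusing the names from this example; the broker address and record contents are illustrative assumptions, and because auto.register.schemas is False the JSON schema must already be registered under the "<topic>-value" subject:

from confluent_kafka import Producer

producer = Producer({"bootstrap.servers": "localhost:9092"})  # placeholder broker

# Illustrative record matching the properties declared in message_schema.
record = {"address": "123 Main St", "keyword": "python", "position": 1}

producer.produce(
    s.registrations_topic,
    key=string_serializer("reg-1", key_context),
    value=json_serializer(record, value_context),
)
producer.flush()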
Example #7
    def schema_registry(self, conf=None):
        if self._sr_url is None:
            raise RuntimeError("No Schema-registry available in Byo cluster")
        return SchemaRegistryClient({"url": self._sr_url})