def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
    super().__init__(ctx)
    self.source_config = config
    self.consumer = confluent_kafka.Consumer(
        {
            "group.id": "test",
            "bootstrap.servers": self.source_config.connection.bootstrap,
            **self.source_config.connection.consumer_config,
        }
    )
    self.schema_registry_client = SchemaRegistryClient(
        {"url": self.source_config.connection.schema_registry_url}
    )
    self.report = KafkaSourceReport()
def schema_registry(self, conf=None):
    if not hasattr(self._cluster, "sr"):
        return None

    sr_conf = {"url": self._cluster.sr.get("url")}

    if conf is not None:
        sr_conf.update(conf)

    return SchemaRegistryClient(sr_conf)
class KafkaSource(Source):
    source_config: KafkaSourceConfig
    consumer: confluent_kafka.Consumer
    report: KafkaSourceReport

    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.consumer = confluent_kafka.Consumer(
            {
                "group.id": "test",
                "bootstrap.servers": self.source_config.connection.bootstrap,
                **self.source_config.connection.consumer_config,
            }
        )
        self.schema_registry_client = SchemaRegistryClient(
            {
                "url": self.source_config.connection.schema_registry_url,
                **self.source_config.connection.schema_registry_config,
            }
        )
        self.report = KafkaSourceReport()

    @classmethod
    def create(cls, config_dict, ctx):
        config = KafkaSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        topics = self.consumer.list_topics().topics
        for t in topics:
            self.report.report_topic_scanned(t)

            if self.source_config.topic_patterns.allowed(t):
                mce = self._extract_record(t)
                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(t)

    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic

        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # we append to this list later on
        )
        dataset_snapshot.aspects.append(Status(removed=False))

        # Fetch the value schema from the registry.
        schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value"
            )
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get value schema: {e}")

        # Parse the value schema.
        fields: List[SchemaField] = []
        if schema and schema.schema_type == "AVRO":
            # "value.id" or "value.[type=string]id"
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
            )

        # Fetch the key schema from the registry.
        key_schema: Optional[Schema] = None
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-key"
            )
            key_schema = registered_schema.schema
        except Exception as e:
            # Do not report a warning: it is okay for a topic to have no key schema.
            logger.debug(f"{topic}: no key schema found. {e}")

        # Parse the key schema.
        key_fields: List[SchemaField] = []
        if key_schema and key_schema.schema_type == "AVRO":
            key_fields = schema_util.avro_schema_to_mce_fields(
                key_schema.schema_str, is_key_schema=True
            )
        elif key_schema is not None:
            self.report.report_warning(
                topic,
                f"Parsing kafka schema type {key_schema.schema_type} is currently not implemented",
            )

        key_schema_str: Optional[str] = None
        if schema is not None or key_schema is not None:
            # Create a merged string for the combined schemas and compute an md5 hash
            # across it. Concatenate the two parts explicitly so a missing key schema
            # does not discard the value schema.
            schema_as_string = (schema.schema_str if schema is not None else "") + (
                key_schema.schema_str if key_schema is not None else ""
            )
            md5_hash = md5(schema_as_string.encode()).hexdigest()

            if key_schema:
                key_schema_str = key_schema.schema_str

            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=md5_hash,
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(
                    documentSchema=schema.schema_str if schema is not None else "",
                    keySchema=key_schema_str,
                ),
                fields=key_fields + fields,
            )
            dataset_snapshot.aspects.append(schema_metadata)

        browse_path = BrowsePathsClass(
            [f"/{self.source_config.env.lower()}/{platform}/{topic}"]
        )
        dataset_snapshot.aspects.append(browse_path)

        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
        return metadata_record

    def get_report(self):
        return self.report

    def close(self):
        if self.consumer:
            self.consumer.close()
class KafkaSource(Source):
    source_config: KafkaSourceConfig
    consumer: confluent_kafka.Consumer
    report: KafkaSourceReport

    def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext):
        super().__init__(ctx)
        self.source_config = config
        self.consumer = confluent_kafka.Consumer(
            {
                "group.id": "test",
                "bootstrap.servers": self.source_config.connection.bootstrap,
                **self.source_config.connection.consumer_config,
            }
        )
        self.schema_registry_client = SchemaRegistryClient(
            {"url": self.source_config.connection.schema_registry_url}
        )
        self.report = KafkaSourceReport()

    @classmethod
    def create(cls, config_dict, ctx):
        config = KafkaSourceConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
        topics = self.consumer.list_topics().topics
        for t in topics:
            self.report.report_topic_scanned(t)

            if self.source_config.topic_patterns.allowed(t):
                mce = self._extract_record(t)
                wu = MetadataWorkUnit(id=f"kafka-{t}", mce=mce)
                self.report.report_workunit(wu)
                yield wu
            else:
                self.report.report_dropped(t)

    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic
        env = "PROD"  # TODO: configure!
        actor, sys_time = "urn:li:corpuser:etl", int(time.time() * 1000)

        metadata_record = MetadataChangeEvent()
        dataset_snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})",
        )
        dataset_snapshot.aspects.append(Status(removed=False))
        metadata_record.proposedSnapshot = dataset_snapshot

        # Fetch schema from the registry.
        has_schema = True
        try:
            registered_schema = self.schema_registry_client.get_latest_version(
                topic + "-value"
            )
            schema = registered_schema.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get schema: {e}")
            has_schema = False

        # Parse the schema
        fields: List[SchemaField] = []
        if has_schema and schema.schema_type == "AVRO":
            fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
        elif has_schema:
            self.report.report_warning(
                topic, f"unable to parse kafka schema type {schema.schema_type}"
            )

        if has_schema:
            schema_metadata = SchemaMetadata(
                schemaName=topic,
                version=0,
                hash=str(schema._hash),
                platform=f"urn:li:dataPlatform:{platform}",
                platformSchema=KafkaSchema(documentSchema=schema.schema_str),
                fields=fields,
                created=AuditStamp(time=sys_time, actor=actor),
                lastModified=AuditStamp(time=sys_time, actor=actor),
            )
            dataset_snapshot.aspects.append(schema_metadata)

        return metadata_record

    def get_report(self):
        return self.report

    def close(self):
        if self.consumer:
            self.consumer.close()
def _create_schema_registry_client(self):
    client = None
    if self.__schema_registry_conf is not None:
        client = SchemaRegistryClient(self.__schema_registry_conf)

    return client
"type": "object", "title": s.registrations_topic + "-value", "properties": { "address": { "type": "string" }, "keyword": { "type": "string" }, "position": { "type": "integer" }, }, } schema_client = SchemaRegistryClient({"url": s.schema_server}) json_serializer = JSONSerializer(dumps(message_schema), schema_client, conf={"auto.register.schemas": False}) string_serializer = StringSerializer() key_context = SerializationContext(s.registrations_topic, MessageField.KEY) value_context = SerializationContext(s.registrations_topic, MessageField.VALUE) con = psycopg2.connect( database=s.db_database, user=s.db_user, password=s.db_password, host=s.db_server, port=s.db_port,
def schema_registry(self, conf=None):
    if self._sr_url is None:
        raise RuntimeError("No Schema-registry available in Byo cluster")

    return SchemaRegistryClient({"url": self._sr_url})
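# Minimal standalone sketch of the SchemaRegistryClient calls the snippets above rely on;
# the registry URL and subject name are placeholders.
from confluent_kafka.schema_registry import SchemaRegistryClient

sr_client = SchemaRegistryClient({"url": "http://localhost:8081"})
# Subjects follow the TopicNameStrategy convention: "<topic>-value" / "<topic>-key".
registered = sr_client.get_latest_version("my-topic-value")
print(registered.schema.schema_type)  # e.g. "AVRO"
print(registered.schema.schema_str)   # raw schema document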