import json

import configs
from confluent_kafka.admin import AdminClient, NewTopic
from confluent_kafka.schema_registry import Schema, SchemaRegistryClient


def main():
    kafka_url = configs.KAFKA_URL
    admin_client = AdminClient({"bootstrap.servers": kafka_url})

    topics_path = configs.KAFKA_TOPIC_INIT_TOPICS_PATH
    with open(topics_path) as f:
        topics_config = json.load(f)

    topic_list = []
    schema_list = []
    for tc in topics_config:
        topic_name = tc["name"]
        topic_num_partitions = tc["num_partitions"]
        topic_list.append(
            NewTopic(topic_name, num_partitions=topic_num_partitions,
                     replication_factor=1))
        if "schema" in tc:
            schema_list.append(tc)

    # Create topics: create_topics() is asynchronous and returns a dict of
    # futures, so errors (e.g. an already existing topic) surface per topic
    # when the future is resolved
    futures = admin_client.create_topics(topic_list)
    for topic, future in futures.items():
        try:
            future.result()
            print(f"Topic {topic} created")
        except Exception as e:
            print(f"Topic {topic} not created: {repr(e)}")

    # Register schemas
    schema_registry_client = SchemaRegistryClient(
        {"url": "http://" + configs.KAFKA_SCHEMA_REGISTRY_URL})
    for ts in schema_list:
        name = ts["schema"]["schema"]["title"]
        schema_type = ts["schema"]["schemaType"]
        schema_raw = ts["schema"]["schema"]
        schema = Schema(json.dumps(schema_raw), schema_type)
        schema_id = schema_registry_client.register_schema(name, schema)
        print(f"Schema {name} registered with id {schema_id}")
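# --- Example topics file (illustrative sketch) ------------------------------
# main() reads a JSON list from configs.KAFKA_TOPIC_INIT_TOPICS_PATH; the
# layout below is inferred from the keys the loop accesses ("name",
# "num_partitions", optional "schema" with "schemaType" and a JSON-schema
# "schema" whose "title" becomes the subject name). The concrete topic names
# and fields are assumptions for illustration only.
#
# [
#   {"name": "plain-topic", "num_partitions": 1},
#   {
#     "name": "typed-topic",
#     "num_partitions": 3,
#     "schema": {
#       "schemaType": "JSON",
#       "schema": {
#         "title": "typed-topic-value",
#         "type": "object",
#         "properties": {"id": {"type": "integer"}}
#       }
#     }
#   }
# ]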
import sys
from time import sleep

import yaml
from confluent_kafka import Consumer, Producer
from confluent_kafka.admin import AdminClient, NewTopic
from confluent_kafka.schema_registry import Schema, SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroDeserializer, AvroSerializer
from confluent_kafka.serialization import (
    MessageField,
    SerializationContext,
    StringDeserializer,
)


class KafkaPC:
    def __init__(self, config_path, config_section):
        super(KafkaPC, self).__init__()
        self.in_topic = None
        self.out_topic = None
        self.in_schema = None
        self.out_schema = None

        self.read_config(config_path, config_section)
        self.connect_schema_registry()
        self.read_topics()
        self.create_topics_on_broker()
        self.register_schemas_in_registry()
        self.create_serializer()
        self.create_deserializer()
        self.create_consumer()
        self.create_producer()

    def connect_schema_registry(self):
        MAX_RETRIES = 3
        if self.config.get("KAFKA_SCHEMA_REGISTRY_URL") is not None:
            sr_conf = {"url": self.config["KAFKA_SCHEMA_REGISTRY_URL"]}
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry = SchemaRegistryClient(sr_conf)
                    print("Connected to Schema Registry")
                    break
                except Exception as e:
                    retries += 1
                    print(f"Could not connect to Schema Registry, retry {retries}")
                    print(repr(e))
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")
        else:
            raise ValueError("Need KAFKA_SCHEMA_REGISTRY_URL")

    def register_schemas_in_registry(self, suffix="-value"):
        MAX_RETRIES = 3
        # nothing to register if no outgoing topics are configured
        if self.out_schema is None:
            return
        for topic, schema in self.out_schema.items():
            subject = topic + suffix
            retries = 0
            while retries < MAX_RETRIES:
                try:
                    self.schema_registry.register_schema(
                        subject_name=subject, schema=schema)
                    print(f"Registered schema for topic {topic} in registry")
                    break
                except Exception as e:
                    retries += 1
                    print(f"Could not register schema for topic {topic} "
                          f"in registry: {repr(e)}")
                    sleep(5)
            if retries == MAX_RETRIES:
                raise ConnectionError("Could not connect to Schema Registry")

    def create_topics_on_broker(self, partitions=1, replication=1):
        # nothing to create if no outgoing topics are configured
        if self.out_topic is None:
            return
        a = AdminClient({"bootstrap.servers": self.config["KAFKA_BROKER_URL"]})
        topic_set = set(self.out_topic)
        md = a.list_topics(timeout=10)
        # md.topics maps topic name -> TopicMetadata, so compare by name
        broker_set = set(md.topics.keys())
        diff_set = topic_set.difference(broker_set)
        new_topics = [
            NewTopic(topic, num_partitions=partitions,
                     replication_factor=replication)
            for topic in diff_set
        ]
        fs = a.create_topics(new_topics)
        # Wait for each operation to finish. Timeouts are preferably
        # controlled by passing request_timeout=15.0 to the create_topics()
        # call. All futures will finish at the same time.
        for topic, f in fs.items():
            try:
                f.result()  # the result itself is None
                print(f"Topic {topic} created on Broker")
            except Exception as e:
                print(f"Failed to create topic {topic} on Broker: {repr(e)}")

    def get_schema_from_registry(self, topic, suffix="-value"):
        response = None
        MAX_RETRIES = 3
        retries = 0
        while retries < MAX_RETRIES:
            try:
                schema = self.schema_registry.get_latest_version(topic + suffix)
                response = schema.schema
                print(f"Retrieved schema for topic {topic} from Registry")
                break
            except Exception as e:
                retries += 1
                print(f"Failed to get schema: {repr(e)}")
                sleep(3)
        return response

    def read_topics(self):
        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            self.in_topic = self.config["IN_TOPIC"]
            self.in_schema = {}
            for topic in self.in_topic:
                # Try to get the schema from the registry; if none is found,
                # a plain string deserializer is used instead (see
                # create_deserializer below)
                self.in_schema[topic] = self.get_schema_from_registry(topic)
        if self.config.get("OUT_TOPIC"):
            self.out_topic = list(self.config["OUT_TOPIC"].keys())
            self.out_schema = {}
            for topic, schema in self.config["OUT_TOPIC"].items():
                self.out_schema[topic] = self.read_avro_schema(schema)

    def create_serializer(self):
        self.serializer = {}
        if self.out_topic is not None:
            for topic in self.out_topic:
                schema_str = self.out_schema[topic].schema_str
                self.serializer[topic] = AvroSerializer(
                    schema_str, self.schema_registry)

    def create_deserializer(self):
        self.deserializer = {}
        if self.in_topic is not None:
            for topic in self.in_topic:
                if self.in_schema[topic] is None:
                    self.deserializer[topic] = StringDeserializer("utf_8")
                else:
                    schema_str = self.in_schema[topic].schema_str
                    self.deserializer[topic] = AvroDeserializer(
                        schema_str, self.schema_registry)

    def create_consumer(self):
        if self.config.get("IN_TOPIC") and self.config.get("IN_GROUP"):
            consumer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"],
                "group.id": self.config["IN_GROUP"],
                "auto.offset.reset": "earliest",
            }
            self.consumer = Consumer(consumer_conf)
            self.consumer.subscribe(self.in_topic)

    def create_producer(self):
        if self.config.get("OUT_TOPIC"):
            producer_conf = {
                "bootstrap.servers": self.config["KAFKA_BROKER_URL"]
            }
            self.producer = Producer(producer_conf)

    def read_config(self, config_path, config_section):
        self.config = {}
        if config_path is not None and config_section is not None:
            # config_section may be a comma-separated list of section names
            config_section = config_section.replace(" ", "").split(",")
        else:
            raise ValueError(
                "Configuration requires config_path and config_section")
        try:
            with open(config_path, "r") as ymlfile:
                config = yaml.load(ymlfile, Loader=yaml.FullLoader)
            for section in config_section:
                for key, value in config[section].items():
                    self.config[key] = value
        except Exception as e:
            print(f"Failed to read the config: {repr(e)}")
            sys.exit()

    def read_avro_schema(self, schema):
        with open(schema, "r") as f:
            schema_str = f.read()
        return Schema(schema_str, "AVRO")

    def decode_msg(self, msg):
        try:
            topic = msg.topic()
            return self.deserializer[topic](msg.value(), None)
        except Exception as e:
            print(f"Error decoding avro data: {repr(e)}")

    def send_msg(self, message, partition=0, topic=None):
        # if no topic is provided, the first out_topic is used as the default
        out_topic = self.out_topic[0] if topic is None else topic
        # encode the data with the Avro schema configured for the topic
        ctx = SerializationContext(out_topic, MessageField.VALUE)
        ser_message = self.serializer[out_topic](message, ctx)
        try:
            self.producer.produce(
                topic=out_topic, value=ser_message, partition=partition)
        except Exception as e:
            print(f"Error sending data to Kafka: {repr(e)}")
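# --- Example usage (illustrative sketch) ------------------------------------
# A minimal consume/decode/produce loop built on KafkaPC. The config path,
# section names, keys and topic names below are assumptions that only mirror
# what read_config() and read_topics() look up; poll() and error() are the
# standard confluent_kafka Consumer API. It also assumes the decoded payload
# already matches the outgoing Avro schema.
#
# config.yml (assumed layout):
#   GENERAL:
#     KAFKA_BROKER_URL: "localhost:9092"
#     KAFKA_SCHEMA_REGISTRY_URL: "http://localhost:8081"
#   MY_APP:
#     IN_TOPIC: ["raw-data"]
#     IN_GROUP: "my-app"
#     OUT_TOPIC:
#       processed-data: "schemas/processed_data.avsc"

if __name__ == "__main__":
    pc = KafkaPC(config_path="config.yml", config_section="GENERAL, MY_APP")
    while True:
        msg = pc.consumer.poll(timeout=1.0)  # wait up to 1s for a message
        if msg is None:
            continue
        if msg.error():
            print(f"Consumer error: {msg.error()}")
            continue
        value = pc.decode_msg(msg)  # Avro- or string-decoded payload
        if value is not None:
            pc.send_msg(value)  # defaults to the first OUT_TOPIC
            pc.producer.poll(0)  # serve pending delivery callbacks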