def loadavsc(self, avscid):
    # global self.avscmap
    self.__logger.debug("In loadavsc with avscid: %s" % avscid)
    avsc = None
    self.__logger.debug(
        "lib_pmgrpcd.OPTIONS.urlscreg: %s lib_pmgrpcd.OPTIONS.calocation: %s"
        % (lib_pmgrpcd.OPTIONS.urlscreg, lib_pmgrpcd.OPTIONS.calocation))
    try:
        self.__logger.debug(
            "Instancing client (CachedSchemaRegistryClient) with avscid:%s url:%s ssl.ca.location:%s",
            avscid,
            lib_pmgrpcd.OPTIONS.urlscreg,
            lib_pmgrpcd.OPTIONS.calocation,
        )
        client = CachedSchemaRegistryClient(
            url=lib_pmgrpcd.OPTIONS.urlscreg,
            ca_location=lib_pmgrpcd.OPTIONS.calocation,
        )
    except Exception as e:
        self.__logger.info(
            "ERROR: instancing CachedSchemaRegistryClient against the schema-registry-server failed"
        )
        self.__logger.info("ERROR: %s" % (e))
        return avsc

    try:
        avsc = client.get_by_id(avscid)
    except Exception as e:
        self.__logger.info(
            "ERROR: loading the avro schema from the schema-registry-server failed in CachedSchemaRegistryClient.get_by_id()"
        )
        self.__logger.info("ERROR: %s" % (e))
        return avsc

    try:
        avsc_dict = json.loads(str(avsc))
    except Exception as e:
        self.__logger.info(
            "ERROR: json.loads of the avsc string failed to produce a dict")
        self.__logger.info("ERROR: %s" % (e))
        return avsc

    self.__logger.info("SCHEMA_OF_ID(%s): %s" % (avscid, avsc_dict["name"]))

    # Query Schema-Registry
    # self.jsonmap = json.load(mapfile)
    if avscid in self.avscmap:
        self.__logger.debug(
            "Updating the existing record avscid (%s) in self.avscmap with the avro schema"
            % avscid)
        self.avscmap[avscid].update({"avsc": avsc_dict})
    else:
        self.__logger.debug(
            "Adding a new record avscid (%s) with the avro schema to self.avscmap"
            % avscid)
        self.avscmap.update({avscid: {"avsc": avsc_dict}})

    return avsc
def loadavsc(avscid):
    global avscmap
    global options
    serializelog.debug("In loadavsc with avscid: %s" % avscid)
    avsc = None
    serializelog.debug("options.urlscreg: %s options.calocation: %s"
                       % (options.urlscreg, options.calocation))
    try:
        serializelog.debug("querying screg with avscid: %s" % (avscid))
        client = CachedSchemaRegistryClient({'url': options.urlscreg,
                                             'ssl.ca.location': options.calocation})
        avsc = client.get_by_id(avscid)
    except Exception as e:
        serializelog.info("ERROR: loading the avro schema from the schema-registry-server failed in CachedSchemaRegistryClient.get_by_id()")
        serializelog.info("ERROR: %s" % (e))
        # without a schema there is nothing to parse, so stop here
        return avsc

    try:
        avsc_dict = json.loads(str(avsc))
    except Exception as e:
        serializelog.info("ERROR: json.loads of the avsc string failed to produce a dict")
        serializelog.info("ERROR: %s" % (e))
        return avsc

    serializelog.info("SCHEMA_OF_ID(%s): %s" % (avscid, avsc_dict["name"]))

    # Query Schema-Registry
    # jsonmap = json.load(mapfile)
    if avscid in avscmap:
        serializelog.debug("Updating the existing record avscid (%s) in avscmap with the avro schema" % avscid)
        avscmap[avscid].update({"avsc": avsc_dict})
    else:
        serializelog.debug("Adding a new record avscid (%s) with the avro schema to avscmap" % avscid)
        avscmap.update({avscid: {"avsc": avsc_dict}})

    return avsc
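# A minimal, self-contained sketch of the lookup pattern shared by both loadavsc()
# variants above: build a CachedSchemaRegistryClient and fetch a schema by registry id.
# The registry URL, CA path, and schema id in the example call are illustrative
# assumptions, not values taken from the projects above.
import json
from confluent_kafka.avro import CachedSchemaRegistryClient

def fetch_schema_dict(schema_registry_url, schema_id, ca_location=None):
    """Return the schema with the given registry id as a dict, or None on failure."""
    try:
        client = CachedSchemaRegistryClient(url=schema_registry_url,
                                            ca_location=ca_location)
        avsc = client.get_by_id(schema_id)   # avro schema object
        return json.loads(str(avsc))         # str(schema) is its JSON text
    except Exception as exc:
        print("schema lookup failed: %s" % exc)
        return None

# Example call (hypothetical endpoint and id):
# schema = fetch_schema_dict("https://schema-registry.example.com", 42)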
class _AvroIORegistry:
    def __init__(self, schema_registry_url):
        """Private implementation class for Avro IO using the registry"""
        log.info(
            f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
        )
        try:
            self.client = CachedSchemaRegistryClient(url=schema_registry_url)
            self.schema = self.client.get_by_id(config.SCHEMA_ID)
            self.serializer = MessageSerializer(self.client)
        except:
            raise ValueError("Client id or schema id not found")

    def decode(self, bytes):
        return self.serializer.decode_message(bytes)

    def encode(self, record):
        return self.serializer.encode_record_with_schema_id(
            config.SCHEMA_ID, record)
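# A hedged usage sketch for the registry-backed IO class above. The registry URL is
# an assumption and config.SCHEMA_ID comes from the surrounding project; the
# MessageSerializer round-trips a record through the Confluent wire format
# (magic byte + schema id + Avro body).
# io_registry = _AvroIORegistry("http://localhost:8081")
# payload = io_registry.encode({"field": "value"})   # bytes ready to produce to Kafka
# record = io_registry.decode(payload)               # back to a Python dict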
class LocationKafkaListerner(object):
    __instance = None

    @staticmethod
    def create(app):
        if LocationKafkaListerner.__instance is None:
            LocationKafkaListerner(app)
        # return the singleton whether it was just created or already existed
        return LocationKafkaListerner.__instance

    def __init__(self, app):
        print("2")
        LocationKafkaListerner.__instance = self
        self.app = app
        self.config = Config.getInstance()
        self.register_client = CachedSchemaRegistryClient(
            url=app.config['KAFKA_SCHEMA_REGISTRY_URL']
        )
        print("3")
        self.client = self.config.getESClient()
        threading.Thread(target=self.readJobsData).start()
        threading.Thread(target=self.readMappingsData).start()
        threading.Thread(target=self.readSubmissionsData).start()

    def readJobsData(self):
        print("4")
        kafkaConsumer = self.config.getKafkaConsumer(
            'locationsearch_job_entity' + str(uuid.uuid1()))
        kafkaConsumer.subscribe(['job_entity'])
        self.get_data(kafkaConsumer)

    def readMappingsData(self):
        kafkaConsumer = self.config.getKafkaConsumer(
            'locationsearch_jobCandidateMapping')
        kafkaConsumer.subscribe(['jobcandidatemapping_entity'])
        self.get_data(kafkaConsumer)

    def readSubmissionsData(self):
        kafkaConsumer = self.config.getKafkaConsumer(
            'locationsearch_jobCandidateInteraction')
        kafkaConsumer.subscribe(['jobcandidateinteraction_entity'])
        self.get_data(kafkaConsumer)

    def get_data(self, consumer):
        print("5")
        while True:
            msg = None
            try:
                print("getting message")
                msg = consumer.poll(10)
                print("after getting a message")
            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(msg, e))
                raise
            except Exception:
                print('An error occurred.')
            print(msg)
            if msg:
                if msg.error():
                    print("AvroConsumer error: {}".format(msg.error()))
                    return
                self.parseLocation(self.unpack(msg.value()))
            else:
                print("No Message!!")

    def parseLocation(self, message):
        entity = json.loads(json.dumps(message))
        if entity.get('locations'):
            # Message from job index
            for l in entity['locations']:
                self.buildLocationLookupEntity(l)
        else:
            # Message from candidate index
            self.buildLocationLookupEntity(entity['currentLocation'])
            self.buildLocationLookupEntity(entity['preferredLocations'])

    def buildLocationLookupEntity(self, location):
        point = location['point']
        latlng = point.split(",") if point else []
        if not point or (float(latlng[0]) == 0 and float(latlng[1]) == 0):
            print('no location latlng = {}'.format(latlng))
        else:
            addressComponents = []
            locationLookup = {}
            if location['city']:
                addressComponents.append(location['city'])
            if location['state']:
                addressComponents.append(location['state'])
            elif location['stateCode']:
                addressComponents.append(location['stateCode'])
            if location['country']:
                addressComponents.append(location['country'])
            elif location['countryCode']:
                addressComponents.append(location['countryCode'])
            if len(addressComponents) == 0:
                if location['continent']:
                    addressComponents.append(location['continent'])
                elif location['continentCode']:
                    addressComponents.append(location['continentCode'])
            locationLookup['id'] = str(uuid.uuid4())
            locationLookup['keywords'] = ", ".join(addressComponents)
            locationLookup['city'] = location['city']
            locationLookup['state'] = location['state']
            locationLookup['stateCode'] = location['stateCode']
            locationLookup['country'] = location['country']
            locationLookup['countryCode'] = location['countryCode']
            locationLookup['continent'] = location['continent']
            locationLookup['continentCode'] = location['continentCode']
            locationLookup['zipCode'] = location['zipCode']
            response = self.client.search(
                index="location_lookup",
                body={
                    "size": 1,
                    "query": {
                        "term": {
                            "keywords.lowercase": locationLookup['keywords'].lower()
                        }
                    }
                }
            )
            if response['hits']['total'] == 0:
                print("Indexing " + locationLookup['keywords'])
                print(self.client.index(
                    index='location_lookup',
                    doc_type='location_lookup',
                    id=locationLookup['id'],
                    refresh='wait_for',
                    body=locationLookup)
                )
            else:
                print("Ignoring " + locationLookup['keywords'])

    def unpack(self, payload):
        MAGIC_BYTES = 0
        magic, schema_id = struct.unpack('>bi', payload[:5])
        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            abc = reader.read(output)
            return abc
        # String key
        else:
            # Timestamp is inside my key
            return payload[:-8].decode()
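# A self-contained sketch of the Confluent wire format that the unpack()/decode_avro()
# helpers in this file rely on: one magic byte (0), a big-endian 4-byte schema id, then
# the Avro-encoded body. The record schema and schema id below are made-up examples;
# real consumers fetch the writer schema from the registry via get_by_id(schema_id).
# (avro.schema.parse is named Parse in some older avro-python3 releases.)
import io
import struct
import avro.schema
from avro.io import BinaryDecoder, BinaryEncoder, DatumReader, DatumWriter

EXAMPLE_SCHEMA = avro.schema.parse(
    '{"type": "record", "name": "Example", "fields": [{"name": "point", "type": "string"}]}'
)

def pack_confluent(record, schema, schema_id):
    """Serialize record with schema and prepend the 5-byte Confluent header."""
    buf = io.BytesIO()
    buf.write(struct.pack('>bi', 0, schema_id))   # magic byte 0 + big-endian schema id
    DatumWriter(schema).write(record, BinaryEncoder(buf))
    return buf.getvalue()

def unpack_confluent(payload, schema):
    """Strip the 5-byte header and decode the Avro body with the given schema."""
    magic, schema_id = struct.unpack('>bi', payload[:5])
    if magic != 0:
        raise ValueError("not a Confluent-framed Avro payload")
    return DatumReader(schema).read(BinaryDecoder(io.BytesIO(payload[5:])))

# Round trip (schema id 1 is an arbitrary stand-in; real ids come from the registry):
# payload = pack_confluent({"point": "47.37,8.54"}, EXAMPLE_SCHEMA, 1)
# assert unpack_confluent(payload, EXAMPLE_SCHEMA) == {"point": "47.37,8.54"}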
class CatalogExporter(VtvTask):
    def __init__(self):
        VtvTask.__init__(self)
        self.catalogId = self.options.cust_id
        self.config = yaml.load(open(self.options.config_file, 'r'),
                                Loader=yaml.FullLoader)[self.catalogId]
        self.consumer = self.init_consumer()
        self.out_dir = self.config['out_dir']
        make_dir(self.out_dir)
        self.s3Location = self.config['s3Location']

    def init_consumer(self):
        bootstrap_server = self.config['bootstrap-server']
        schema_url = self.config['schema-registery-url']
        # KAFKA BROKER URL
        consumer = Consumer({
            'bootstrap.servers': bootstrap_server,
            'group.id': 'catalog-export-%s' % self.catalogId,
            'auto.offset.reset': 'earliest'
        })
        # SCHEMA URL
        self.register_client = CachedSchemaRegistryClient(url=schema_url)
        consumer.subscribe(['catserver-%s-catalog' % self.catalogId],
                           on_assign=self.my_on_assign)
        return consumer

    def my_on_assign(self, consumer, partitions):
        for p in partitions:
            # some starting offset, or use OFFSET_BEGINNING, et al.
            # the default offset is STORED which means use committed offsets, and if
            # no committed offsets are available use auto.offset.reset config (default latest)
            p.offset = 0
        # call assign() to start fetching the given partitions.
        consumer.assign(partitions)

    def decode_avro(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])
        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            decoded = reader.read(output)
            return decoded, schema.name
        # no magic bytes, something is wrong
        else:
            raise ValueError

    def generate_catalog(self):
        output_file = os.path.join(
            self.out_dir,
            "catalog-%s-%s.gz" % (self.catalogId, round(time.time() * 1000)))
        output_file_fp = gzip.open(output_file, "wt")
        catalogTopicPartition = TopicPartition(
            "catserver-%s-catalog" % self.catalogId, 0, 0)
        lastMsgToRead = self.consumer.get_watermark_offsets(
            catalogTopicPartition)[1] - 1
        current_offset = 0
        print(lastMsgToRead)
        record_list = []
        # offset of the latest occurrence of each message key
        ids_dict = {}
        # while current_offset < lastMsgToRead:
        # count = lastMsgToRead
        first_pass = True
        cnt = 0
        rec = 0
        # while count > lastMsgToRead - 10 && count <= lastMsgToRead:
        while current_offset < 30:
            try:
                msg_list = self.consumer.consume(5, 100)
            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(
                    msg_list, e))
                raise SerializerError
            for msg in msg_list:
                if msg.error():
                    print("AvroConsumer error: {}".format(msg.error()))
                    return
                msg_id = msg.key()
                print(msg.offset(), msg_id)
                if first_pass:
                    ids_dict[msg_id] = msg.offset()
                else:
                    if msg.value() is None:
                        message = None
                        content_type = "PROGRAM"
                    else:
                        message, content_type = self.decode_avro(msg.value())
                    if ids_dict[msg_id] == msg.offset():
                        rec = rec + 1
                        record_list.append(
                            OrderedDict([("id", msg_id.decode("utf-8")),
                                         ("type", content_type),
                                         ("offset", msg.offset()),
                                         ("content", message)]))
                        if len(record_list) >= 10000:
                            output_file_fp.write('\n'.join(
                                json.dumps(record)
                                for record in record_list) + "\n")
                            del record_list[:]
                # count = msg.offset()
                current_offset = msg.offset()
                cnt = cnt + 1
                self.logger.info(
                    "Continuing to the processes. Currently at offset {}/{}".format(
                        current_offset, lastMsgToRead))
            if first_pass and current_offset == lastMsgToRead:
                first_pass = False
                current_offset = 0
                catalogTopicPartition = TopicPartition(
                    "catserver-%s-catalog" % self.catalogId, 0, 0)
                self.consumer.seek(catalogTopicPartition)
        self.consumer.close()
        if len(record_list) > 0:
            output_file_fp.write('\n'.join(
                json.dumps(record) for record in record_list))
        output_file_fp.close()
        print(
            s3_utils.upload_file_to_s3(output_file, self.s3Location,
                                       self.logger))
        print("size in Bytes: %d" % sys.getsizeof(ids_dict))
        print("unique records: %d" % len(ids_dict))
        print("Last msg offset: %d" % lastMsgToRead)
        print("No of records: %d" % cnt)
        print("No of written records: %d" % rec)

    def run_main(self):
        self.generate_catalog()

    def set_options(self):
        config_file = os.path.join(self.system_dirs.VTV_ETC_DIR,
                                   'exporter_cfg.yaml')
        self.parser.add_option('-c', '--config-file', default=config_file,
                               help='configuration file')
        self.parser.add_option('-t', '--cust-id',
                               help="name of the customer")

    def cleanup(self):
        self.move_logs(self.out_dir, [('.', '*log')])
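# A minimal sketch (broker and topic names are placeholders) of the offset bookkeeping
# the exporters above rely on: query the high watermark of partition 0 to know where the
# snapshot ends, and force consumption from the start of the partition via on_assign.
from confluent_kafka import Consumer, TopicPartition, OFFSET_BEGINNING

def snapshot_high_watermark(bootstrap_servers, topic):
    """Return (low, high) watermarks of partition 0; high - 1 is the last offset to export."""
    consumer = Consumer({
        'bootstrap.servers': bootstrap_servers,   # placeholder, e.g. "localhost:9092"
        'group.id': 'catalog-export-example',     # placeholder group id
        'auto.offset.reset': 'earliest',
    })

    def rewind(cons, partitions):
        # same idea as my_on_assign() above: ignore committed offsets, start at the beginning
        for p in partitions:
            p.offset = OFFSET_BEGINNING
        cons.assign(partitions)

    # on_assign only fires once consume()/poll() triggers the first rebalance
    consumer.subscribe([topic], on_assign=rewind)
    low, high = consumer.get_watermark_offsets(TopicPartition(topic, 0))
    consumer.close()
    return low, high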
class CatalogExporter(VtvTask):
    def __init__(self):
        VtvTask.__init__(self)
        self.catalogId = self.options.cust_id
        print(self.catalogId)
        self.config = yaml.load(open(self.options.config_file, 'r'),
                                Loader=yaml.FullLoader)[self.catalogId]
        self.consumer = self.init_consumer()
        self.out_dir = self.config['out_dir']
        make_dir(self.out_dir)
        self.s3Location = self.config['s3Location']

    def init_consumer(self):
        bootstrap_server = self.config['bootstrap-server']
        schema_url = self.config['schema-registery-url']
        # KAFKA BROKER URL
        consumer = Consumer({
            'bootstrap.servers': bootstrap_server,
            'group.id': 'catalog-export-%s' % self.catalogId,
            'auto.offset.reset': 'earliest'
        })
        # SCHEMA URL
        self.register_client = CachedSchemaRegistryClient(url=schema_url)
        consumer.subscribe(['catserver-%s-catalog' % self.catalogId],
                           on_assign=self.my_on_assign)
        return consumer

    def my_on_assign(self, consumer, partitions):
        for p in partitions:
            # some starting offset, or use OFFSET_BEGINNING, et al.
            # the default offset is STORED which means use committed offsets, and if
            # no committed offsets are available use auto.offset.reset config (default latest)
            p.offset = 0
        # call assign() to start fetching the given partitions.
        consumer.assign(partitions)

    def decode_avro(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])
        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            decoded = reader.read(output)
            return decoded, schema.name
        # no magic bytes, something is wrong
        else:
            raise ValueError

    def generate_catalog(self):
        # output_file = os.path.join(self.out_dir, "catalog-%s-%s.gz" % (self.catalogId, round(time.time() * 1000)))
        output_file = os.path.join(self.out_dir,
                                   "catalog-%s-%s.gz" % (self.catalogId, TIME_STAMP))
        output_file_fp = OUTFILE_MAP[output_file]
        available_ids_file = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
        catalogTopicPartition = TopicPartition("catserver-%s-catalog" % self.catalogId, 0, 0)
        lastMsgToRead = self.consumer.get_watermark_offsets(catalogTopicPartition)[1] - 1
        current_offset = 0
        print(lastMsgToRead)
        record_list = []
        ps = self.consumer.list_topics("catserver-%s-catalog" % self.catalogId)
        print(ps.topics)
        cnt, rec, offer_content = 0, 0, 0
        batch_size = 500
        first_pass = True
        while current_offset < lastMsgToRead:
            try:
                msg_list = self.consumer.consume(batch_size, 100)
            except SerializerError as e:
                print("Message deserialization failed for {}: {}".format(msg_list, e))
                raise SerializerError
            for msg in msg_list:
                if msg.error():
                    print("AvroConsumer error: {}".format(msg.error()))
                    return
                if msg.key() is None:
                    print("Key is None for offset {}".format(msg.offset()))
                    continue
                msg_id = msg.key().decode("utf-8")
                if first_pass:
                    existing_offset = UNIQUE_IDS.get(msg_id)
                    if existing_offset is None:
                        if msg.value() is not None:
                            UNIQUE_IDS[msg_id] = msg.offset()
                            # message, content_type = self.decode_avro(msg.value())
                            # if content_type in ("VodOffer", "LinearBlock"):
                            #     self.parse_availability_first(content_type, message)
                            self.logger.info("msg_id: {}, msg_offset: {} added for the first time in dict".format(msg_id, msg.offset()))
                    elif msg.value() is None:
                        del UNIQUE_IDS[msg_id]
                        self.logger.info("Content is null for msg_id: {}, msg_offset: {}, existing offset: {}. Remove from dict".format(msg_id, msg.offset(), existing_offset))
                    else:
                        UNIQUE_IDS[msg_id] = msg.offset()
                        # message, content_type = self.decode_avro(msg.value())
                        # if content_type in ("VodOffer", "LinearBlock"):
                        #     self.parse_availability_first(content_type, message)
                else:
                    existing_offset = UNIQUE_IDS.get(msg_id)
                    if existing_offset is None:
                        self.logger.info("msg_id: {}, msg_offset: {} not present, msg.value(): {}".format(msg_id, msg.offset(), msg.value()))
                    elif existing_offset == msg.offset():
                        message, content_type = self.decode_avro(msg.value())
                        if content_type in ("VodOffer", "LinearBlock"):
                            offer_content = offer_content + 1
                            self.parse_availability(available_ids_file, content_type, message)
                        else:
                            rec = rec + 1
                            record_list.append(OrderedDict([("id", msg_id),
                                                            ("type", content_type),
                                                            ("offset", msg.offset()),
                                                            ("content", message)]))
                            if len(record_list) >= 10000:
                                output_file_fp.write('\n'.join(json.dumps(record) for record in record_list) + "\n")
                                del record_list[:]
                current_offset = msg.offset()
                cnt = cnt + 1
                self.logger.info("Continuing to the processes. Currently at offset {}/{}".format(current_offset, lastMsgToRead))
            if first_pass and current_offset == lastMsgToRead:
                first_pass = False
                current_offset = 0
                print("No of records: %d first pass" % cnt)
                cnt = 0
                self.consumer.seek(catalogTopicPartition)
        self.consumer.close()
        if len(record_list) > 0:
            output_file_fp.write('\n'.join(json.dumps(record) for record in record_list) + "\n")
        '''
        output_file_fp.close()
        available_ids_file = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
        print(os.path.exists(available_ids_file))
        if os.stat(output_file).st_size > 0 and os.path.exists(available_ids_file):
            s3_upload_status = s3_utils.upload_file_to_s3(output_file, self.s3Location, self.logger)
            available_ids_file_upload = s3_utils.upload_file_to_s3(AVAILABLE_IDS_FILE, self.s3Location, self.logger)
            print("catalogue upload: %d, available_ids_file_upload: %d" % (s3_upload_status, available_ids_file_upload))
            self.logger.info("catalogue upload: {}, available_ids_file_upload: {}".format(s3_upload_status, available_ids_file_upload))
        '''
        print("size in Bytes: %d" % sys.getsizeof(UNIQUE_IDS))
        print("unique records: %d" % len(UNIQUE_IDS))
        print("Last msg offset: %d" % lastMsgToRead)
        print("No of records: %d" % cnt)
        print("No of written records: %d" % rec)
        print("No of offer content records: %d" % offer_content)
        print("No of available IDs: %d" % len(AVAILABLE_IDS))

    def parse_availability(self, filename, content_type, content_info):
        # filename = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
        fp = OUTFILE_MAP.get(filename)
        # if not fp:
        #     fp = open(filename, 'w')
        #     OUTFILE_MAP[filename] = fp
        if not content_info:
            self.logger.info("Content Empty for ID: %s" % self.sk)
            return
        if content_type == "LinearBlock":
            offers = self.get_value(content_info, "offers", [])
            for offer in offers:
                work_id = self.get_value(offer, "workId", "")
                series_id = self.get_value(offer, "seriesId", "")
                end_time = self.get_value(offer, "endTime", "")
                if end_time:
                    end_date = datetime.datetime.strptime(end_time.split('T')[0], '%Y-%m-%d').date()
                else:
                    end_date = DATE_TODAY
                if work_id and end_date >= DATE_TODAY:
                    fp.write('%s\n' % work_id)
                    AVAILABLE_IDS.add(work_id)
                if series_id and end_date >= DATE_TODAY:
                    fp.write('%s\n' % series_id)
                    AVAILABLE_IDS.add(series_id)
        else:
            work_id = self.get_value(content_info, "workId", "")
            series_id = self.get_value(content_info, "seriesId", "")
            end_time = self.get_value(content_info, "endTime", "")
            if end_time:
                end_date = datetime.datetime.strptime(end_time.split('T')[0], '%Y-%m-%d').date()
            else:
                end_date = DATE_TODAY
            if work_id and end_date >= DATE_TODAY:
                fp.write('%s\n' % work_id)
                AVAILABLE_IDS.add(work_id)
            if series_id and end_date >= DATE_TODAY:
                fp.write('%s\n' % series_id)
                AVAILABLE_IDS.add(series_id)

    def get_value(self, d, key, default):
        value = d.get(key)
        if value is None:
            value = default
        return value

    def create_files(self):
        try:
            catalogue_output_file = os.path.join(self.out_dir,
                                                 "catalog-%s-%s.gz" % (self.catalogId, TIME_STAMP))
            catalogue_output_file_p = gzip.open(catalogue_output_file, "wt")
            OUTFILE_MAP[catalogue_output_file] = catalogue_output_file_p
        except Exception as e:
            print(e)
            return -1
        try:
            available_ids_file = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
            available_ids_file_p = open(available_ids_file, 'w')
            OUTFILE_MAP[available_ids_file] = available_ids_file_p
        except Exception as e:
            print(e)
            return -1
        return 0

    def close(self):
        for outf in OUTFILE_MAP.values():
            outf.close()

    def upload(self):
        for key in OUTFILE_MAP:
            s3_upload_status = s3_utils.upload_file_to_s3(key, self.s3Location, self.logger)
            print("s3_upload_status: %d" % (s3_upload_status))
            self.logger.info("s3 upload status: {}".format(s3_upload_status))

    def run_main(self):
        if self.create_files() == 0:
            self.generate_catalog()
        self.close()
        self.upload()

    def set_options(self):
        config_file = os.path.join(self.system_dirs.VTV_ETC_DIR, 'exporter_cfg.yaml')
        self.parser.add_option('-c', '--config-file', default=config_file,
                               help='configuration file')
        self.parser.add_option('-t', '--cust-id', help="name of the customer")

    def cleanup(self):
        self.move_logs(self.out_dir, [('.', '*log')])
class KafkaAvroProcessor(object):
    def __init__(self, kafka_conf):
        self.topics = None
        self.register_client = None
        self.consumer_conf = {
            'auto.offset.reset': 'earliest',
            'enable.partition.eof': True,
        }
        self.consumer_conf.update(kafka_conf)
        # self.producer_conf = kafka_conf
        # self.producer = None
        self.consumer = None
        self.MAGIC_BYTES = 0

    def init_producer(self):
        pass

    def init_consumer(self, schema_registry_url, topics):
        logger.info("Initializing avro consumer")
        self.consumer = Consumer(self.consumer_conf)
        logger.info(f"Schema registry url: {schema_registry_url}")
        self.register_client = CachedSchemaRegistryClient(url=schema_registry_url)
        logger.info(f"Subscribing to topics: {topics}")
        self.topics = topics
        self.consumer.subscribe(self.topics)

    @staticmethod
    def delivery_report(err, msg):
        """ Called once for each message produced to indicate delivery result.
            Triggered by poll() or flush(). """
        if err is not None:
            logger.error('Message delivery failed: {}'.format(err))
        else:
            logger.info('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))

    def produce(self, messages):
        pass

    def consume(self, db_manager=None):
        logger.info("Consuming")
        while True:
            try:
                msg = self.consumer.poll(timeout=1)
            except SerializerError as e:
                logger.error("Message deserialization failed for {}: {}".format(msg, e))
                raise SerializerError

            if msg is None:
                continue

            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    logger.info('%% %s [%d] reached end at offset %d'
                                % (msg.topic(), msg.partition(), msg.offset()))
                    continue
                logger.error("AvroConsumer error: {}".format(msg.error()))
                return

            key, value = self._unpack(msg.key()), self._unpack(msg.value())
            logger.info(f"Message: {key}, {value}")

            if db_manager:
                if not value['BEFORE']:
                    value['BEFORE'] = {}
                if not value['AFTER']:
                    value['AFTER'] = {}
                sql = f"""INSERT INTO public.fdw_kafka(key, BEFORE, AFTER, FLIGHT_URL, LEG_URL, AFTER_RAW_DATA) values ('{key}', '{value['BEFORE']}', '{value['AFTER']}', '{value['FLIGHT_URL']}', '{value['LEG_URL']}', '');"""
                db_manager.execute(sql)

    def _unpack(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])
        # Get Schema registry
        # Avro value format
        if magic == self.MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            abc = reader.read(output)
            return abc
        # String key
        else:
            # If KSQL payload, exclude timestamp which is inside the key.
            # payload[:-8].decode()
            return payload.decode()
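# A hedged usage sketch for KafkaAvroProcessor above. The broker address, registry URL,
# and topic name are placeholders; consume() blocks until interrupted, and without a
# db_manager it only logs the decoded key/value pairs.
if __name__ == "__main__":
    processor = KafkaAvroProcessor({
        'bootstrap.servers': 'localhost:9092',   # placeholder broker
        'group.id': 'avro-processor-example',    # placeholder group id
    })
    processor.init_consumer('http://localhost:8081', ['flights'])  # assumed URL and topic
    processor.consume()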
class KafkaConsumer(multiprocessing.Process):
    Conf = {
        'offset': 0,
    }
    ConsumerProperties = {'auto.offset.reset': 'earliest'}
    DebugOptions = {'debug': 'all', 'log_level': '0'}
    consumer = None
    RegistryClient = None

    def __init__(self, conf=None):
        # a multiprocessing.Process subclass must initialise the base class before start()
        multiprocessing.Process.__init__(self)
        self.getconf(conf)
        self.initSchemaRegistry()
        self.initConsumer()
        if config.debug >= 3:
            logging.basicConfig(
                format='%(asctime)s.%(msecs)s:%(name)s:%(thread)d:%(levelname)s:%(process)d:%(message)s',
                level=logging.DEBUG)

    def getconf(self, conf):
        self.Conf.update(conf['services']['Kafka'])
        self.Conf.update(conf['args'])
        # ensure a 'properties' section exists even if the config omits it
        self.Conf.setdefault('properties', {})

    def initSchemaRegistry(self):
        try:
            RegistryConfig = {'url': self.Conf['schema.registry']}
        except KeyError:
            # no registry configured; payloads will be returned undecoded
            return
        self.RegistryClient = CachedSchemaRegistryClient(**RegistryConfig)
        debug(level=1, RegistryClient=self.RegistryClient)

    def assignPartitions(self, consumer, partitions):
        for p in partitions:
            p.offset = int(self.Conf['offset'])
        consumer.assign(partitions)

    def initConsumer(self):
        ConsumerConfig = {
            'bootstrap.servers': str.join(',', self.Conf['brokers']),
            'group.id': self.Conf['groupid']
        }
        ConsumerProperties = {}
        if config.debug >= 3:
            ConsumerConfig.update(self.DebugOptions)
            trace(ConsumerConfig)
        ConsumerProperties.update(self.ConsumerProperties)
        ConsumerProperties.update(self.Conf['properties'])
        ConsumerConfig.update(ConsumerProperties)
        self.consumer = Consumer(ConsumerConfig)
        self.consumer.subscribe([self.Conf['topic']],
                                on_assign=self.assignPartitions)

    def unpack(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])
        if magic == MAGIC_BYTES:
            schema = self.RegistryClient.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            abc = reader.read(output)
            return abc
        else:
            return payload.decode()

    def readMessageByPartitionOffsetAvro(self):
        _count = False
        print('polling ', end='', flush=True)
        while True:
            try:
                msg = self.consumer.poll(1)
            except SerializerError as e:
                if _count:
                    print('SerializerError')
                print("Message deserialization failed for {}: {}".format(msg, e))
                raise SerializerError
            if msg is None:
                _count = True
                print('.', end='', flush=True)
                continue
            if msg.error():
                if _count:
                    print('msg.error')
                print("AvroConsumer error: {}".format(msg.error()))
                continue
            key, value = self.unpack(msg.key()), self.unpack(msg.value())
            if _count:
                print('ok')
            return value
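# A hedged construction sketch for the KafkaConsumer process above, using the nested
# conf layout getconf() expects (conf['services']['Kafka'] plus conf['args']). All broker,
# topic, group, and registry values are placeholders.
# conf = {
#     'services': {
#         'Kafka': {
#             'brokers': ['localhost:9092'],
#             'topic': 'example-topic',
#             'groupid': 'example-group',
#             'schema.registry': 'http://localhost:8081',
#         }
#     },
#     'args': {'offset': 0},
# }
# consumer = KafkaConsumer(conf)
# value = consumer.readMessageByPartitionOffsetAvro()   # first decoded value at the offset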