Example No. 1
    def loadavsc(self, avscid):
        # global  self.avscmap
        self.__logger.debug("In loadavsc with avscid: %s" % avscid)
        avsc = None
        self.__logger.debug(
            "lib_pmgrpcd.OPTIONS.urlscreg: %s lib_pmgrpcd.OPTIONS.calocation: %s"
            % (lib_pmgrpcd.OPTIONS.urlscreg, lib_pmgrpcd.OPTIONS.calocation))

        try:
            self.__logger.debug(
                "Instancing client (CachedSchemaRegistryClient) with avscid:%s url:%s ssl.ca.location:%s",
                avscid,
                lib_pmgrpcd.OPTIONS.urlscreg,
                lib_pmgrpcd.OPTIONS.calocation,
            )
            client = CachedSchemaRegistryClient(
                url=lib_pmgrpcd.OPTIONS.urlscreg,
                ca_location=lib_pmgrpcd.OPTIONS.calocation,
            )
        except Exception as e:
            self.__logger.info(
                "ERROR: load avro schema from schema-registry-server is failed on CachedSchemaRegistryClient on using method get_by_id()"
            )
            self.__logger.info("ERROR: %s" % (e))
            return avsc

        try:
            avsc = client.get_by_id(avscid)
        except Exception as e:
            self.__logger.info(
                "ERROR: load avro schema from schema-registry-server is failed on CachedSchemaRegistryClient on using method get_by_id()"
            )
            self.__logger.info("ERROR: %s" % (e))
            return avsc

        try:
            avsc_dict = json.loads(str(avsc))
        except Exception as e:
            self.__logger.info(
                "ERROR: json.loads of the avsc_str is faild to produce a dict")
            self.__logger.info("ERROR: %s" % (e))
            return avsc

        self.__logger.info("SCHEMA_OF_ID(%s): %s" %
                           (avscid, avsc_dict["name"]))

        # Query Schema-Registry
        # self.jsonmap = json.load(mapfile)
        if avscid in self.avscmap:
            self.__logger.debug(
                "Update  self.avscmap the existing record avscid (%s) with avroschema"
                % avscid)
            self.avscmap[avscid].update({"avsc": avsc_dict})
        else:
            self.__logger.debug(
                "Update  self.avscmap with new record avscid (%s) with avroschema"
                % avscid)
            self.avscmap.update({avscid: {"avsc": avsc_dict}})

        return avsc
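
Since get_by_id() returns a parsed Avro schema object whose str() is the schema JSON (which is why json.loads(str(avsc)) works above), the cached schema can be fed straight into avro's DatumReader. A minimal sketch, not part of the example; raw_bytes is assumed to be a plain Avro body without the Confluent framing:

import io
from avro.io import BinaryDecoder, DatumReader

def decode_with_schema(avsc, raw_bytes):
    # avsc: the schema object returned by loadavsc()/get_by_id()
    # raw_bytes: schemaless Avro bytes (no magic byte / schema-id header)
    reader = DatumReader(avsc)
    return reader.read(BinaryDecoder(io.BytesIO(raw_bytes)))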
Example No. 2
def loadavsc(avscid):
  global avscmap
  global options
  serializelog.debug("In loadavsc with avscid: %s" % avscid)
  avsc = None
  serializelog.debug("options.urlscreg: %s options.calocation: %s" % (options.urlscreg, options.calocation))

  try:
    serializelog.debug("querying screg with avscid: %s" % (avscid))
    client = CachedSchemaRegistryClient({'url':options.urlscreg, 'ssl.ca.location':options.calocation})
    avsc = client.get_by_id(avscid)
  except Exception as e:
    serializelog.info("ERROR: loading the Avro schema from the schema-registry-server failed in CachedSchemaRegistryClient.get_by_id()")
    serializelog.info("ERROR: %s" % (e))
    return avsc

  try:
    avsc_dict = json.loads(str(avsc))
  except Exception as e:
    serializelog.info("ERROR: json.loads of the avsc string failed to produce a dict")
    serializelog.info("ERROR: %s" % (e))
    return avsc

  serializelog.info("SCHEMA_OF_ID(%s): %s" % (avscid, avsc_dict["name"]))

  #Query Schema-Registry
  #jsonmap = json.load(mapfile)
  if avscid in avscmap:
    serializelog.debug("Update avscmap the existing record avscid (%s) with avroschema" % avscid)
    avscmap[avscid].update({"avsc": avsc_dict})
  else:
    serializelog.debug("Update avscmap with new record avscid (%s) with avroschema" % avscid)
    avscmap.update({avscid:{"avsc": avsc_dict}})

  return avsc
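
CachedSchemaRegistryClient is the legacy client; newer confluent-kafka releases (1.4+) expose the same lookup through confluent_kafka.schema_registry.SchemaRegistryClient. A minimal sketch of the equivalent call, not used by the examples above:

from confluent_kafka.schema_registry import SchemaRegistryClient

def load_schema_str(url, schema_id, ca_location=None):
    conf = {'url': url}
    if ca_location:
        conf['ssl.ca.location'] = ca_location
    # get_schema() returns a Schema object; schema_str holds the Avro schema JSON
    return SchemaRegistryClient(conf).get_schema(schema_id).schema_str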
Example No. 3
class _AvroIORegistry:
    def __init__(self, schema_registry_url):
        """Private implementation class for Avro IO using the registry"""
        log.info(
            f"Using registry with schema_url/id {schema_registry_url}/{config.SCHEMA_ID}"
        )
        try:
            self.client = CachedSchemaRegistryClient(url=schema_registry_url)
            self.schema = self.client.get_by_id(config.SCHEMA_ID)
            self.serializer = MessageSerializer(self.client)
        except Exception:
            raise ValueError("Client id or schema id not found")

    def decode(self, payload):
        return self.serializer.decode_message(payload)

    def encode(self, record):
        return self.serializer.encode_record_with_schema_id(
            config.SCHEMA_ID, record)
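
A hypothetical usage sketch of _AvroIORegistry; the registry URL is a placeholder and config.SCHEMA_ID comes from the surrounding application, so the record below only illustrates the call shape:

io_registry = _AvroIORegistry("http://localhost:8081")
payload = io_registry.encode({"some_field": "value"})  # Confluent-framed Avro bytes
record = io_registry.decode(payload)                   # back to a Python dict
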
class LocationKafkaListerner(object):
    __instance = None

    @staticmethod
    def create(app):
        if LocationKafkaListerner.__instance is None:
            LocationKafkaListerner(app)
        return LocationKafkaListerner.__instance

    def __init__(self, app):
        print("2")
        LocationKafkaListerner.__instance = self
        self.app = app
        self.config = Config.getInstance()
        self.register_client = CachedSchemaRegistryClient(
            url=app.config['KAFKA_SCHEMA_REGISTRY_URL']
        )
        print("3")
        self.client = self.config.getESClient()
        threading.Thread(target=self.readJobsData).start()
        threading.Thread(target=self.readMappingsData).start()
        threading.Thread(target=self.readSubmissionsData).start()

    def readJobsData(self):
        print("4")
        kafkaConsumer = self.config.getKafkaConsumer(
            'locationsearch_job_entity' + str(uuid.uuid1()))
        kafkaConsumer.subscribe(['job_entity'])
        self.get_data(kafkaConsumer)

    def readMappingsData(self):
        kafkaConsumer = self.config.getKafkaConsumer(
            'locationsearch_jobCandidateMapping')
        kafkaConsumer.subscribe(['jobcandidatemapping_entity'])
        self.get_data(kafkaConsumer)

    def readSubmissionsData(self):
        kafkaConsumer = self.config.getKafkaConsumer(
            'locationsearch_jobCandidateInteraction')
        kafkaConsumer.subscribe(['jobcandidateinteraction_entity'])
        self.get_data(kafkaConsumer)

    def get_data(self, consumer):
        print("5")
        while True:
            try:
                print("getting message")
                msg = consumer.poll(10)
                print("after getting a message")
            except SerializerError as e:
                print("Message deserialization failed: {}".format(e))
                raise
            except Exception as e:
                print('An error occurred: {}'.format(e))
                continue

            print(msg)
            if msg:
                if msg.error():
                    print("AvroConsumer error: {}".format(msg.error()))
                    return
                self.parseLocation(self.unpack(msg.value()))
            else:
                print("No Message!!")

    def parseLocation(self, message):
        entity = json.loads(json.dumps(message))
        if entity['locations']:
            # Message from job index
            for l in entity['locations']:
                self.buildLocationLookupEntity(l)
        else:
            # Message from candidate index
            self.buildLocationLookupEntity(entity['currentLocation'])
            self.buildLocationLookupEntity(entity['preferredLocations'])

    def buildLocationLookupEntity(self, location):
        point = location['point']
        latlng = point.split(",") if point else []
        if not point or (float(latlng[0]) == 0 and float(latlng[1]) == 0):
            print('no location latlng = {}'.format(latlng))
        else:
            addressComponents = []
            locationLookup = {}
            if location['city']:
                addressComponents.append(location['city'])
            if location['state']:
                addressComponents.append(location['state'])
            elif location['stateCode']:
                addressComponents.append(location['stateCode'])
            if location['country']:
                addressComponents.append(location['country'])
            elif location['countryCode']:
                addressComponents.append(location['countryCode'])
            if len(addressComponents) == 0:
                if location['continent']:
                    addressComponents.append(location['continent'])
                elif location['continentCode']:
                    addressComponents.append(location['continentCode'])

            locationLookup['id'] = str(uuid.uuid4())
            locationLookup['keywords'] = ", ".join(addressComponents)
            locationLookup['city'] = location['city']
            locationLookup['state'] = location['state']
            locationLookup['stateCode'] = location['stateCode']
            locationLookup['country'] = location['country']
            locationLookup['countryCode'] = location['countryCode']
            locationLookup['continent'] = location['continent']
            locationLookup['continentCode'] = location['continentCode']
            locationLookup['zipCode'] = location['zipCode']
            response = self.client.search(
                index="location_lookup",
                body={
                    "size": 1,
                    "query": {
                        "term": {
                            "keywords.lowercase": locationLookup['keywords'].lower()
                        }
                    }
                }
            )

            if(response['hits']['total'] == 0):
                print("Indexing " + locationLookup['keywords'])
                print(self.client.index(
                    index='location_lookup',
                    doc_type='location_lookup',
                    id=locationLookup['id'],
                    refresh='wait_for',
                    body=locationLookup)
                )
            else:
                print("Ignoring" + locationLookup['keywords'])

    def unpack(self, payload):
        MAGIC_BYTES = 0
        magic, schema_id = struct.unpack('>bi', payload[:5])
        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            abc = reader.read(output)
            return abc
        # String key
        else:
            # Timestamp is inside my key
            return payload[:-8].decode()
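
unpack() above reads the Confluent wire format: one magic byte (0), a 4-byte big-endian schema id, then the Avro-encoded body. A minimal sketch of the matching encode side, assuming schema is the avro schema object fetched from the registry:

import io
import struct
from avro.io import BinaryEncoder, DatumWriter

def pack(schema_id, schema, record):
    buf = io.BytesIO()
    buf.write(struct.pack('>bi', 0, schema_id))            # magic byte + schema id
    DatumWriter(schema).write(record, BinaryEncoder(buf))  # Avro-encoded body
    return buf.getvalue()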
Example No. 5
class CatalogExporter(VtvTask):
    def __init__(self):
        VtvTask.__init__(self)
        self.catalogId = self.options.cust_id
        self.config = yaml.load(open(self.options.config_file, 'r'),
                                Loader=yaml.FullLoader)[self.catalogId]
        self.consumer = self.init_consumer()
        self.out_dir = self.config['out_dir']
        make_dir(self.out_dir)
        self.s3Location = self.config['s3Location']

    def init_consumer(self):
        bootstrap_server = self.config['bootstrap-server']
        schema_url = self.config['schema-registery-url']
        # KAFKA BROKER URL
        consumer = Consumer({
            'bootstrap.servers': bootstrap_server,
            'group.id': 'catalog-export-%s' % self.catalogId,
            'auto.offset.reset': 'earliest'
        })

        # SCHEMA URL
        self.register_client = CachedSchemaRegistryClient(url=schema_url)
        consumer.subscribe(['catserver-%s-catalog' % self.catalogId],
                           on_assign=self.my_on_assign)
        return consumer

    def my_on_assign(self, consumer, partitions):
        for p in partitions:
            # some starting offset, or use OFFSET_BEGINNING, et, al.
            # the default offset is STORED which means use committed offsets, and if
            # no committed offsets are available use auto.offset.reset config (default latest)
            p.offset = 0
            # call assign() to start fetching the given partitions.
        consumer.assign(partitions)

    def decode_avro(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])

        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            decoded = reader.read(output)
            return decoded, schema.name
        # no magic bytes, something is wrong
        else:
            raise ValueError

    def generate_catalog(self):
        output_file = os.path.join(
            self.out_dir,
            "catalog-%s-%s.gz" % (self.catalogId, round(time.time() * 1000)))
        output_file_fp = gzip.open(output_file, "wt")
        catalogTopicPartition = TopicPartition(
            "catserver-%s-catalog" % self.catalogId, 0, 0)
        lastMsgToRead = self.consumer.get_watermark_offsets(
            catalogTopicPartition)[1] - 1
        current_offset = 0
        print(lastMsgToRead)
        record_list = []
        #while current_offset < lastMsgToRead:
        #count = lastMsgToRead
        first_pass = True
        cnt = 0
        rec = 0
        #while count > lastMsgToRead - 10 && count <= lastMsgToRead:
        while current_offset < 30:
            try:
                msg_list = self.consumer.consume(5, 100)
            except SerializerError as e:
                print("Message deserialization failed: {}".format(e))
                raise

            for msg in msg_list:
                if msg.error():
                    print("AvroConsumer error: {}".format(msg.error()))
                    return

                msg_id = msg.key()
                print(msg.offset(), msg_id)
                if first_pass:
                    ids_dict[msg_id] = msg.offset()
                else:
                    if msg.value() is None:
                        message = None
                        content_type = "PROGRAM"
                    else:
                        message, content_type = self.decode_avro(msg.value())
                    if ids_dict[msg_id] == msg.offset():
                        rec = rec + 1
                        record_list.append(
                            OrderedDict([("id", msg_id.decode("utf-8")),
                                         ("type", content_type),
                                         ("offset", msg.offset()),
                                         ("content", message)]))
                    if len(record_list) >= 10000:
                        output_file_fp.write('\n'.join(
                            json.dumps(record)
                            for record in record_list) + "\n")
                        del record_list[:]
                #count  = msg.offset()
                current_offset = msg.offset()
                cnt = cnt + 1
                self.logger.info(
                    "Continuing to the processes. Currently at offset {}/{}".
                    format(current_offset, lastMsgToRead))
            if first_pass and current_offset == lastMsgToRead:
                first_pass = False
                current_offset = 0
                catalogTopicPartition = TopicPartition(
                    "catserver-%s-catalog" % self.catalogId, 0, 0)
                self.consumer.seek(catalogTopicPartition)
        self.consumer.close()

        if len(record_list) > 0:
            output_file_fp.write('\n'.join(
                json.dumps(record) for record in record_list))

        output_file_fp.close()
        print(
            s3_utils.upload_file_to_s3(output_file, self.s3Location,
                                       self.logger))
        print("size in Bytes: %d" % sys.getsizeof(ids_dict))
        print("unique records:  %d" % len(ids_dict))
        print("Last msg offset: %d" % lastMsgToRead)
        print("No of records: %d" % cnt)
        print("No of written records: %d" % rec)

    def run_main(self):
        self.generate_catalog()

    def set_options(self):
        config_file = os.path.join(self.system_dirs.VTV_ETC_DIR,
                                   'exporter_cfg.yaml')
        self.parser.add_option('-c',
                               '--config-file',
                               default=config_file,
                               help='configuration file')
        self.parser.add_option('-t', '--cust-id', help="name of the customer")

    def cleanup(self):
        self.move_logs(self.out_dir, [('.', '*log')])
Example No. 6
class CatalogExporter(VtvTask):
    def __init__(self):
        VtvTask.__init__(self)
        self.catalogId = self.options.cust_id
        print(self.catalogId)
        self.config = yaml.load(open(self.options.config_file, 'r'), Loader=yaml.FullLoader)[self.catalogId]
        self.consumer = self.init_consumer()
        self.out_dir = self.config['out_dir']
        make_dir(self.out_dir)
        self.s3Location = self.config['s3Location']


    def init_consumer(self):
        bootstrap_server = self.config['bootstrap-server']
        schema_url = self.config['schema-registery-url']
        # KAFKA BROKER URL
        consumer = Consumer({
            'bootstrap.servers': bootstrap_server, 
            'group.id': 'catalog-export-%s' %self.catalogId,
            'auto.offset.reset': 'earliest'
        })

        # SCHEMA URL
        self.register_client = CachedSchemaRegistryClient(url=schema_url)
        consumer.subscribe(['catserver-%s-catalog' % self.catalogId], on_assign=self.my_on_assign)
        return consumer

    def my_on_assign(self, consumer, partitions):
        for p in partitions:
            # some starting offset, or use OFFSET_BEGINNING, et, al.
            # the default offset is STORED which means use committed offsets, and if
            # no committed offsets are available use auto.offset.reset config (default latest)
            p.offset = 0
            # call assign() to start fetching the given partitions.
        consumer.assign(partitions)

    def decode_avro(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])

        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            decoded = reader.read(output)
            return decoded, schema.name
        # no magic bytes, something is wrong
        else:
            raise ValueError

    def generate_catalog(self):
        #output_file = os.path.join(self.out_dir, "catalog-%s-%s.gz" % (self.catalogId, round(time.time() * 1000)))
        output_file = os.path.join(self.out_dir, "catalog-%s-%s.gz" % (self.catalogId, TIME_STAMP))
        output_file_fp = OUTFILE_MAP[output_file]
        available_ids_file = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
        catalogTopicPartition = TopicPartition("catserver-%s-catalog" % self.catalogId, 0, 0)
        lastMsgToRead = self.consumer.get_watermark_offsets(catalogTopicPartition)[1] - 1
        current_offset = 0
        print(lastMsgToRead)
        record_list = []
        ps = self.consumer.list_topics("catserver-%s-catalog" % self.catalogId)
        print(ps.topics)
        cnt, rec, offer_content = 0, 0, 0
        batch_size = 500
        first_pass = True
        while current_offset < lastMsgToRead:  
            try:
                msg_list = self.consumer.consume(batch_size, 100)
            except SerializerError as e:
                print("Message deserialization failed: {}".format(e))
                raise

            for msg in msg_list:
                if msg.error():
                    print("AvroConsumer error: {}".format(msg.error()))
                    return
                if msg.key() is None:
                    print("Key is None for offset {}".format(msg.offset()))
                    continue

                msg_id = msg.key().decode("utf-8")
                if first_pass:
                    existing_offset = UNIQUE_IDS.get(msg_id)
                    if existing_offset is None:
                        if msg.value() is not None:
                            UNIQUE_IDS[msg_id] = msg.offset()
                            #message, content_type = self.decode_avro(msg.value())
                            #if content_type in ("VodOffer", "LinearBlock"):
                            #    self.parse_availability_first(content_type, message)
                            self.logger.info("msg_id: {}, msg_offset: {} added for the first time in dict".format(msg_id, msg.offset()))
                    elif msg.value() is None:
                        del UNIQUE_IDS[msg_id]
                        self.logger.info("Content is null for msg_id: {}, msg_offset: {}, existing offset: {}. Remove from dict".format(msg_id, msg.offset(), existing_offset))
                    else:
                        UNIQUE_IDS[msg_id] = msg.offset()
                        #message, content_type = self.decode_avro(msg.value())
                        #if content_type in ("VodOffer", "LinearBlock"):
                        #    self.parse_availability_first(content_type, message)
                else:
                    existing_offset = UNIQUE_IDS.get(msg_id)
                    if existing_offset is None:
                        self.logger.info("msg_id: {}, msg_offset: {} not present, msg.value(): {}".format(msg_id, msg.offset(), msg.value()))
                    elif existing_offset == msg.offset():
                        message, content_type = self.decode_avro(msg.value())
                        if content_type in ("VodOffer", "LinearBlock"):
                            offer_content = offer_content + 1
                            self.parse_availability(available_ids_file, content_type, message)
                        else:
                            rec = rec + 1
                            record_list.append(OrderedDict([("id", msg_id), ("type", content_type), ("offset", msg.offset()), ("content", message)]))
                            if len(record_list) >= 10000:
                                output_file_fp.write('\n'.join(json.dumps(record) for record in record_list) + "\n")
                                del record_list[:]

                current_offset = msg.offset()
                cnt = cnt + 1
                self.logger.info("Continuing to the processes. Currently at offset {}/{}".format(current_offset, lastMsgToRead))
            if first_pass and current_offset == lastMsgToRead:
                first_pass = False
                current_offset = 0
                print("No of records: %d first pass" % cnt)
                cnt = 0
                self.consumer.seek(catalogTopicPartition)
        self.consumer.close()

        if len(record_list) > 0:
            output_file_fp.write('\n'.join(json.dumps(record) for record in record_list) + "\n")

        '''
        output_file_fp.close()
        available_ids_file = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
        print(os.path.exists(available_ids_file))
        if os.stat(output_file).st_size > 0 and os.path.exists(available_ids_file):
            s3_upload_status = s3_utils.upload_file_to_s3(output_file, self.s3Location, self.logger)
            available_ids_file_upload = s3_utils.upload_file_to_s3(AVAILABLE_IDS_FILE, self.s3Location, self.logger)
            print("catalogue upload: %d, available_ids_file_upload: %d" % (s3_upload_status, available_ids_file_upload))
            self.logger.info("catalogue upload: {}, available_ids_file_upload: {}".format(s3_upload_status, available_ids_file_upload))
        '''
        print("size in Bytes: %d" % sys.getsizeof(UNIQUE_IDS))
        print("unique records:  %d" % len(UNIQUE_IDS))
        print("Last msg offset: %d" % lastMsgToRead)
        print("No of records: %d" % cnt)
        print("No of written records: %d" % rec)
        print("No of offer content records: %d" % offer_content)
        print("No of available IDs: %d" % len(AVAILABLE_IDS))

    def parse_availability(self, filename, content_type, content_info):
        #filename = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
        fp = OUTFILE_MAP.get(filename)
        #if not fp:
        #    fp = open(filename, 'w')
        #    OUTFILE_MAP[filename] = fp
        if not content_info:
            self.logger.info("Content empty for content_type: %s" % content_type)
            return
        if content_type == "LinearBlock":
            offers = self.get_value(content_info, "offers", [])
            for offer in offers:
                work_id = self.get_value(offer, "workId", "")
                series_id = self.get_value(offer, "seriesId", "")
                end_time = self.get_value(offer, "endTime", "")
                if end_time:
                    end_date = datetime.datetime.strptime(end_time.split('T')[0], '%Y-%m-%d').date()
                else:
                    end_date = DATE_TODAY
                if work_id and end_date >= DATE_TODAY:
                    fp.write('%s\n' % work_id)
                    AVAILABLE_IDS.add(work_id)
                if series_id and end_date >= DATE_TODAY:
                    fp.write('%s\n' % series_id)
                    AVAILABLE_IDS.add(series_id)
        else:
            work_id = self.get_value(content_info, "workId", "")
            series_id = self.get_value(content_info, "seriesId", "")
            end_time = self.get_value(content_info, "endTime", "")
            if end_time:
                end_date = datetime.datetime.strptime(end_time.split('T')[0], '%Y-%m-%d').date()
            else:
                end_date = DATE_TODAY
            if work_id and end_date >= DATE_TODAY:
                fp.write('%s\n' % work_id)
                AVAILABLE_IDS.add(work_id)
            if series_id and end_date >= DATE_TODAY:
                fp.write('%s\n' % series_id)
                AVAILABLE_IDS.add(series_id)

    def get_value(self, d, key, default):
        value = d.get(key)
        if value is None:
            value = default
        return value

    def create_files(self):
        try:
            catalogue_output_file = os.path.join(self.out_dir, "catalog-%s-%s.gz" % (self.catalogId, TIME_STAMP))
            catalogue_output_file_p = gzip.open(catalogue_output_file, "wt")
            OUTFILE_MAP[catalogue_output_file] = catalogue_output_file_p 
        except Exception as e:
            print(e)
            return -1

        try:
            available_ids_file = os.path.join(self.out_dir, AVAILABLE_IDS_FILE)
            available_ids_file_p = open(available_ids_file, 'w')
            OUTFILE_MAP[available_ids_file] = available_ids_file_p
        except Exception as e:
            print(e)
            return -1
        return 0


    def close(self):
        for outf in OUTFILE_MAP.values():
            outf.close()

    def upload(self):
        for key in OUTFILE_MAP:
            s3_upload_status = s3_utils.upload_file_to_s3(key, self.s3Location, self.logger)
            print("s3_upload_status: %d" % (s3_upload_status))
            self.logger.info("s3 upload status: {}".format(s3_upload_status))

    def run_main(self):
        if(self.create_files() == 0):
            self.generate_catalog()
            self.close()
            self.upload()

    def set_options(self):
        config_file = os.path.join(self.system_dirs.VTV_ETC_DIR, 'exporter_cfg.yaml')
        self.parser.add_option('-c', '--config-file', default=config_file, help='configuration file')
        self.parser.add_option('-t', '--cust-id', help="name of the customer")

    def cleanup(self):
        self.move_logs(self.out_dir, [('.', '*log')])
Example No. 7
class KafkaAvroProcessor(object):
    def __init__(self, kafka_conf):
        self.topics = None
        self.register_client = None
        self.consumer_conf = {
            'auto.offset.reset': 'earliest',
            'enable.partition.eof': True,
        }
        self.consumer_conf.update(kafka_conf)
        # self.producer_conf = kafka_conf
        # self.producer = None
        self.consumer = None
        self.MAGIC_BYTES = 0

    def init_producer(self):
        pass

    def init_consumer(self, schema_registry_url, topics):
        logger.info("Initializing avro consumer")
        self.consumer = Consumer(self.consumer_conf)
        logger.info(f"Schema registry url: {schema_registry_url}")
        self.register_client = CachedSchemaRegistryClient(
            url=schema_registry_url)
        logger.info(f"Subscribing to topics: {topics}")
        self.topics = topics
        self.consumer.subscribe(self.topics)

    @staticmethod
    def delivery_report(err, msg):
        """ Called once for each message produced to indicate delivery result.
            Triggered by poll() or flush(). """
        if err is not None:
            logger.error('Message delivery failed: {}'.format(err))
        else:
            logger.info('Message delivered to {} [{}]'.format(msg.topic(), msg.partition()))

    def produce(self, messages):
        pass

    def consume(self, db_manager=None):
        logger.info("Consuming")
        while True:
            try:
                msg = self.consumer.poll(timeout=1)
            except SerializerError as e:
                logger.error("Message deserialization failed: {}".format(e))
                raise

            if msg is None:
                continue

            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    logger.info('%% %s [%d] reached end at offset %d' %
                                (msg.topic(), msg.partition(), msg.offset()))
                    continue
                logger.error("AvroConsumer error: {}".format(msg.error()))
                return

            key, value = self._unpack(msg.key()), self._unpack(msg.value())
            logger.info(f"Message: {key}, {value}")
            if db_manager:
                if not value['BEFORE']: value['BEFORE'] = {}
                if not value['AFTER']: value['AFTER'] = {}
                sql = f"""INSERT INTO public.fdw_kafka(key, BEFORE, AFTER, FLIGHT_URL, LEG_URL, AFTER_RAW_DATA)
                            values ('{key}', '{value['BEFORE']}', '{value['AFTER']}', '{value['FLIGHT_URL']}', '{value['LEG_URL']}', '');"""
                db_manager.execute(sql)

    def _unpack(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])
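        # Confluent wire format: 1 magic byte (0), a 4-byte big-endian schema id,
        # then the Avro-encoded body.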

        # Get Schema registry
        # Avro value format
        if magic == self.MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            abc = reader.read(output)
            return abc
        # String key
        else:
            # If KSQL payload, exclude timestamp which is inside the key.
            # payload[:-8].decode()
            return payload.decode()
Example No. 8
class KafkaConsumer(multiprocessing.Process):

    Conf = {
        'offset': 0,
    }

    ConsumerProperties = {'auto.offset.reset': 'earliest'}

    DebugOptions = {'debug': 'all', 'log_level': '0'}

    consumer = None
    RegistryClient = None

    def __init__(self, conf=None):
        super().__init__()
        self.getconf(conf)
        self.initSchemaRegistry()
        self.initConsumer()
        if (config.debug >= 3):
            logging.basicConfig(
                format=
                '%(asctime)s.%(msecs)s:%(name)s:%(thread)d:%(levelname)s:%(process)d:%(message)s',
                level=logging.DEBUG)

    def getconf(self, conf):
        self.Conf.update(conf['services']['Kafka'])
        self.Conf.update(conf['args'])
        self.Conf.setdefault('properties', {})

    def initSchemaRegistry(self):
        try:
            RegistryConfig = {'url': self.Conf['schema.registry']}
        except KeyError:
            return

        self.RegistryClient = CachedSchemaRegistryClient(**RegistryConfig)
        debug(level=1, RegistryClient=self.RegistryClient)

    def assignPartitions(self, consumer, partitions):
        for p in partitions:
            p.offset = int(self.Conf['offset'])

        consumer.assign(partitions)

    def initConsumer(self):
        ConsumerConfig = {
            'bootstrap.servers': str.join(',', self.Conf['brokers']),
            'group.id': self.Conf['groupid']
        }

        ConsumerProperties = {}

        if (config.debug >= 3):
            ConsumerConfig.update(self.DebugOptions)

        trace(ConsumerConfig)
        ConsumerProperties.update(self.ConsumerProperties)
        ConsumerProperties.update(self.Conf['properties'])
        ConsumerConfig.update(ConsumerProperties)
        self.consumer = Consumer(ConsumerConfig)
        self.consumer.subscribe([self.Conf['topic']],
                                on_assign=self.assignPartitions)

    def unpack(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])

        if magic == MAGIC_BYTES:
            schema = self.RegistryClient.get_by_id(schema_id)
            reader = DatumReader(schema)
            output = BinaryDecoder(io.BytesIO(payload[5:]))
            abc = reader.read(output)
            return abc
        else:
            return payload.decode()

    def readMessageByPartitionOffsetAvro(self):
        _count = False
        print('polling ', end='', flush=True)
        while True:
            try:
                msg = self.consumer.poll(1)
            except SerializerError as e:
                if _count:
                    print('SerializerError')
                print("Message deserialization failed: {}".format(e))
                raise

            if msg is None:
                _count = True
                print('.', end='', flush=True)
                continue

            if msg.error():
                if _count:
                    print('msg.error')
                print("AvroConsumer error: {}".format(msg.error()))
                continue

            key, value = self.unpack(msg.key()), self.unpack(msg.value())
            if _count:
                print('ok')
            return value
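
A hypothetical usage sketch for the KafkaConsumer class above; the surrounding module supplies config, debug() and trace(), and the conf layout below only mirrors the keys that getconf(), initSchemaRegistry() and initConsumer() read:

conf = {
    'services': {
        'Kafka': {
            'brokers': ['localhost:9092'],      # joined into bootstrap.servers
            'groupid': 'example-group',
            'topic': 'example-topic',
            'schema.registry': 'http://localhost:8081',
        }
    },
    'args': {'offset': 0},
}
consumer = KafkaConsumer(conf)
value = consumer.readMessageByPartitionOffsetAvro()  # blocks until one Avro message is decoded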