def get_indexes_for_auto_config(self, **autoconf):
    log.debug('Attempting to autoconfigure ES indices')
    ignored_topics = set(autoconf.get('ignored_topics', []))
    log.debug('Ignoring topics: %s' % (ignored_topics,))
    log.debug('Previously configured topics: %s' %
              (self.autoconfigured_topics,))
    # copy the config to force fresh environment lookups
    args = kafka_config.copy()
    try:
        consumer = KafkaConsumer(**args)
        topics = [
            i for i in consumer.topics()
            if i not in ignored_topics
            and i not in self.autoconfigured_topics
        ]
    except Exception as ke:
        log.error('Autoconfig failed to get available topics:\n%s' % ke)
        return []  # can't auto-configure if Kafka isn't available
    geo_point = (autoconf.get('geo_point_name')
                 if autoconf.get('geo_point_creation', False)
                 else None)
    auto_ts = autoconf.get('auto_timestamp')
    indexes = []
    for topic_name in topics:
        self.autoconfigured_topics.append(topic_name)
        index_name = (autoconf.get('index_name_template') % topic_name).lower()
        log.debug('Index name => %s' % index_name)
        indexes.append({
            'name': index_name,
            'body': self.get_index_for_topic(topic_name, geo_point, auto_ts),
        })
    return indexes
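
# A minimal sketch of the `autoconf` settings read above; the key names come
# straight from the lookups in get_indexes_for_auto_config, but the values
# here are illustrative assumptions, not the project's defaults.
SAMPLE_AUTOCONF = {
    'ignored_topics': ['__confluent.support.metrics'],  # assumed example topic
    'index_name_template': 'es_index_%s',  # '%s' receives the topic name
    'geo_point_creation': True,
    'geo_point_name': 'geo_point',
    'auto_timestamp': 'timestamp',
}
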
def get_consumer(self, kconf, kafka_group, topics=None):
    # avoid a mutable default argument; callers pass a list of topics
    try:
        if not topics:
            raise WorkerException('No topics specified')
        args = kconf.copy()
        args['group_id'] = kafka_group
        consumer = KafkaConsumer(**args)
        consumer.subscribe(topics)
        return consumer
    except Exception as ke:
        self.status = WorkerStatus.ERR_KAFKA
        LOG.warning(f'Worker {self._id}: Could not connect to Kafka: {ke}')
        raise WorkerException('No Kafka Connection')
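
# A minimal sketch of the helpers the snippet above assumes (WorkerException,
# WorkerStatus); the real definitions live elsewhere in the worker module, so
# treat these as illustrative stand-ins only.
import enum


class WorkerException(Exception):
    """Raised when a worker cannot be configured or connected."""


class WorkerStatus(enum.Enum):
    OK = 'ok'                # assumed member, not shown in the snippet
    ERR_KAFKA = 'err_kafka'  # referenced above on connection failure
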
def connect_kafka():
    for _ in range(CONN_RETRY):
        try:
            # copy the config to force fresh environment lookups
            args = kafka_config.copy()
            consumer = KafkaConsumer(**args)
            consumer.topics()  # any metadata call proves the connection works
            log.debug('Connected to Kafka...')
            return
        except Exception as ke:
            log.debug('Could not connect to Kafka: %s' % ke)
            sleep(CONN_RETRY_WAIT_TIME)
    log.critical('Failed to connect to Kafka after %s retries' % CONN_RETRY)
    sys.exit(1)  # kill the consumer with an error
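
# connect_kafka() assumes these module-level retry settings; the values below
# are illustrative guesses, not the project's configuration.
CONN_RETRY = 5             # how many connection attempts before giving up
CONN_RETRY_WAIT_TIME = 2   # seconds to sleep between attempts
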
class KafkaViewer(object):

    def __init__(self, interactive=True):
        if interactive:
            self.start()

    def start(self):
        self.killed = False
        signal.signal(signal.SIGINT, self.kill)
        signal.signal(signal.SIGTERM, self.kill)
        clear()
        self.topics()

    def ask(self, options):
        bold("Select an option from the list\n")
        for x, opt in enumerate(options, 1):
            line = "%s ) %s" % (x, opt)
            norm(line)
        while True:
            x = input("choices: ( %s ) : " %
                      ([x + 1 for x in range(len(options))]))
            try:
                res = options[int(x) - 1]
                return res
            except Exception as err:
                error("%s is not a valid option | %s" % (x, err))

    def get_consumer(self, quiet=False, topic=None):
        kafka_settings_path = os.environ['KAFKA_CONFIG']
        with open(kafka_settings_path) as f:
            args = json.load(f)
        if not quiet:
            clear()
            pjson(["Creating Consumer from %s args:" % kafka_settings_path, args])
        self.connect_consumer(**args)
        if not self.consumer:
            raise Exception('Could not connect to Kafka')
        if topic:
            # subscribe takes a list of topic names
            self.consumer.subscribe([topic])

    # Just need something for the unit tests to do, as this is almost 100% integration code
    def consumer_connected(self):
        try:
            if self.consumer:
                return True
            return False
        except AttributeError:
            return False

    # Just need something for integration tests as everything else needs lots of mocking.
    def connect_consumer(self, *args, **kwargs):
        try:
            self.consumer = KafkaConsumer(**kwargs)
        except Exception:
            # swallow connection errors; callers check self.consumer
            self.consumer = None

    def kill(self, *args, **kwargs):
        self.killed = True

    def count_all_topic_messages(self, topic):  # counts individual messages
        subtotals = []
        self.get_consumer(True, topic)
        while not self.consumer.poll():
            try:
                self.consumer.seek_to_beginning()
                break
            except AssertionError:
                LOG.info(f'{topic} waiting for consumer poll...')
                sleep(0.25)
        try:
            while True:
                messages = self.consumer.poll_and_deserialize(1000, 1000)
                if not messages:
                    return sum(subtotals)
                for bundles in messages.values():
                    for bundle in bundles:
                        subtotals.append(len(bundle.get('messages', [])))
        finally:
            self.consumer.close()

    def get_topic_size(self, topic):  # offsets
        self.get_consumer(True, topic)
        while not self.consumer.poll():
            try:
                self.consumer.seek_to_end()
                break
            except AssertionError:
                LOG.info(f'{topic} waiting for consumer poll...')
                sleep(0.25)

        partitions = [
            TopicPartition(topic, p)
            for p in self.consumer.partitions_for_topic(topic)
        ]
        end_offsets = self.consumer.end_offsets(partitions)
        self.consumer.close(autocommit=False)
        return sum(end_offsets.values())

    def print_topic_sizes(self, topics):
        clear()
        bold("Topic -> Count")
        for t in topics:
            size = self.count_all_topic_messages(t)
            norm("%s -> %s" % (t, size))
        wait()
        clear()
        return

    def topics(self):
        while True:
            topic_size = {}
            self.get_consumer(quiet=True)
            refresh_str = "-> Refresh Topics"
            detailed_str = "-> Get Real Message Counts for Topics"
            quit_str = "-> Exit KafkaViewer\n"
            LOG.info('fetching topics')
            bold('Fetching Topics')
            topics = []  # default to empty so a timeout cannot leave this as None
            with timeout(5):
                topics = sorted(
                    i for i in self.consumer.topics()
                    if i not in EXCLUDED_TOPICS
                )
                LOG.info(f'found topics {topics}')
                bold(f'Inspecting topics {topics}')
            if not topics:
                LOG.info('no topics found')
                bold("No topics available")
            for topic in topics:
                LOG.info(f'inspecting topic: {topic}')
                bold(f'Inspecting topic: {topic}')
                try:
                    with timeout(5):
                        topic_size[topic] = self.get_topic_size(topic)
                        LOG.info(f'{topic} size: {topic_size[topic]}')
                        bold('...Success')
                except TimeoutError:
                    topic_size[topic] = 'timed-out'
                    bold('...Failure!')

            clear()
            prompt_key = {
                "topic: %s {%s}" % (topic, topic_size[topic]): topic
                for topic in topics
            }
            prompts = sorted(prompt_key.keys())
            prompts.extend([detailed_str, refresh_str, quit_str])
            bold("Choose a Topic to View")
            norm("-> topic {# of offsets in topic}\n")
            topic = self.ask(prompts)
            topic = prompt_key.get(topic, topic)
            if topic == quit_str:
                return
            elif topic == refresh_str:
                clear()
                continue
            elif topic == detailed_str:
                self.print_topic_sizes(topics)
                continue

            self.get_consumer(topic=topic)
            self.consumer.seek_to_beginning()
            self.show_topic()

    def show_topic(self, batch_size=50):
        current = 0
        while True:
            messages = self.consumer.poll_and_deserialize(1000, batch_size)
            if not messages:
                norm("No more messages available!")
                return
            choices = list(messages.keys())
            if len(choices) > 1:
                bold("Choose a Partition to View")
                # ask() returns the selected element itself, which is the
                # partition key into the messages dict
                part = self.ask(choices)
                messages = messages.get(part)
            else:
                messages = messages.get(choices[0])
            messages_read = self.view_messages(messages, batch_size, current)
            if not messages_read:
                return
            current += messages_read

    def view_messages(self, messages, batch_size, current):
        options = [
            "Next Message",
            "Skip forward to next package of messages",
            "View Current Schema",
            "Exit to List of Available Topics\n"
        ]
        pulled_size = sum(len(m.get('messages', [])) for m in messages)
        for x, message in enumerate(messages):
            if not message.get('messages'):
                bold('\nThe current settings return no messages\n')
            for y, msg in enumerate(message.get('messages')):
                norm("message #%s (%s of batch sized %s)" %
                     (current + 1, y + 1, pulled_size))
                pjson(msg)
                res = self.ask(options)
                idx = options.index(res)
                if idx == 1:
                    clear()
                    return pulled_size
                elif idx == 2:
                    pjson(message.get('schema'))
                    wait()
                elif idx == 3:
                    clear()
                    return False
                else:
                    clear()
                current += 1

    def view_topics(self):
        with timeout(5):
            self.get_consumer(quiet=True)
            return sorted([i for i in self.consumer.topics() if i not in EXCLUDED_TOPICS])
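
# A minimal sketch of two names KafkaViewer relies on: EXCLUDED_TOPICS and the
# `timeout` context manager. The implementation below assumes a Unix-only
# signal.alarm approach and is illustrative; the project's real helper may
# differ, but it must raise TimeoutError, since topics() catches exactly that.
import signal
from contextlib import contextmanager

EXCLUDED_TOPICS = ['__consumer_offsets']  # assumed internal topics to hide


@contextmanager
def timeout(seconds):
    def _handler(signum, frame):
        raise TimeoutError(f'timed out after {seconds}s')

    old_handler = signal.signal(signal.SIGALRM, _handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)
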
class ESConsumer(threading.Thread):
    # A single consumer subscribed to topic, pushing to an index
    # Runs as a daemon to avoid weird stops
    def __init__(self, index, processor, has_group=True, doc_type=None):
        # has_group = False only used for testing
        self.processor = processor
        self.index = index
        self.doc_type = doc_type
        self.es_type = processor.es_type
        self.topic = processor.topic_name
        self.consumer_timeout = 1000  # ms
        self.consumer_max_records = 1000
        kafka_topic_template = consumer_config.get(
            'kafka_topic_template', 'elastic_{es_index_name}_{data_type}')
        self.group_name = kafka_topic_template.format(
            es_index_name=self.index,
            data_type=self.es_type) if has_group else None
        self.sleep_time = 10
        self.stopped = False
        self.consumer = None
        self.thread_id = 0
        super(ESConsumer, self).__init__()

    def connect(self, kafka_config):
        # copy the config to force fresh environment lookups
        args = kafka_config.copy()
        args['client_id'] = self.group_name
        args['group_id'] = self.group_name
        try:
            log.debug(f'Kafka CONFIG [{self.thread_id}]'
                      f'[{self.index}:{self.group_name}]')
            log.debug(json.dumps(args, indent=2))
            self.consumer = KafkaConsumer(**args)
            self.consumer.subscribe([self.topic])
            log.debug('Consumer %s subscribed on topic: %s @ group %s' %
                      (self.index, self.topic, self.group_name))
            return True
        except Exception as ke:
            log.error('%s failed to subscribe to topic %s with error\n%s' %
                      (self.index, self.topic, ke))
            return False

    def run(self):
        self.thread_id = threading.get_ident()
        log.debug(
            f'Consumer [{self.thread_id}] running on {self.index} : {self.es_type}'
        )
        while True:
            if self.connect(kafka_config):
                break
            elif self.stopped:
                return
            sleep(2)
        last_schema = None
        while not self.stopped:
            new_messages = self.consumer.poll_and_deserialize(
                timeout_ms=self.consumer_timeout,
                max_records=self.consumer_max_records)
            if not new_messages:
                log.info(f'Kafka IDLE [{self.thread_id}]'
                         f'[{self.index}:{self.group_name}]')
                sleep(5)
                continue
            for partition_key, packages in new_messages.items():
                for package in packages:
                    schema = package.get('schema')
                    messages = package.get('messages')
                    log.debug('messages #%s' % len(messages))
                    if schema != last_schema:
                        log.info('Schema change on type %s' % self.es_type)
                        log.debug('schema: %s' % schema)
                        self.processor.load_avro(schema)
                        self.get_route = self.processor.create_route()
                    else:
                        log.debug('Schema unchanged.')
                    count = 0
                    for x, msg in enumerate(messages):
                        doc = self.processor.process(msg)
                        count = x
                        log.debug(f'Kafka READ [{self.thread_id}]'
                                  f'[{self.index}:{self.group_name}]'
                                  f' -> {doc.get("id")}')
                        self.submit(doc)
                    log.debug(f'Kafka COMMIT [{self.thread_id}]'
                              f'[{self.index}:{self.group_name}]')
                    self.consumer.commit_async(callback=self.report_commit)
                    log.info('processed %s docs in index %s' %
                             ((count + 1), self.es_type))
                    last_schema = schema

        log.info('Shutting down consumer %s | %s' % (self.index, self.topic))
        self.consumer.close(autocommit=True)
        return

    def report_commit(self, offsets, response):
        log.info(f'Kafka OFFSET CMT {offsets} -> {response}')

    def submit(self, doc, route=None):
        parent = doc.get('_parent', None)
        if parent:  # _parent field can only be in metadata apparently
            del doc['_parent']
        try:
            if ES_VERSION > 5:
                route = self.get_route(doc)
                # if route:
                #     log.debug(doc)
                #     log.debug(route)
                es.create(index=self.index,
                          id=doc.get('id'),
                          routing=route,
                          doc_type=self.doc_type,
                          body=doc)
            else:
                es.create(index=self.index,
                          doc_type=self.es_type,
                          id=doc.get('id'),
                          parent=parent,
                          body=doc)
            log.debug(f'ES CREATE-OK [{self.thread_id}]'
                      f'[{self.index}:{self.group_name}]'
                      f' -> {doc.get("id")}')

        except Exception as ese:  # TransportError is already an Exception subclass
            log.info(
                'Could not create doc because of error: %s\nAttempting update.'
                % ese)
            try:
                if ES_VERSION > 5:
                    route = self.get_route(doc)
                    es.update(index=self.index,
                              id=doc.get('id'),
                              routing=route,
                              doc_type=self.doc_type,
                              # the Elasticsearch update API expects the
                              # partial document wrapped under a 'doc' key
                              body={'doc': doc})
                else:
                    es.update(index=self.index,
                              doc_type=self.es_type,
                              id=doc.get('id'),
                              parent=parent,
                              body={'doc': doc})
                log.debug(f'ES UPDATE-OK [{self.thread_id}]'
                          f'[{self.index}:{self.group_name}]'
                          f' -> {doc.get("id")}')
            except TransportError as te:
                log.debug('conflict exists, ignoring document with id %s: %s' %
                          (doc.get('id', 'unknown'), te))

    def stop(self):
        log.info('%s caught stop signal' % (self.group_name))
        self.stopped = True
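
# A minimal usage sketch for ESConsumer. `make_processor` is a hypothetical
# factory, not part of the source: the real processor must expose es_type,
# topic_name, load_avro(), create_route() and process(), as used in run()
# and submit() above.
if __name__ == '__main__':
    processor = make_processor('my_topic')  # hypothetical; not in the source
    consumer = ESConsumer('my_index', processor)
    consumer.daemon = True  # matches the note: runs as a daemon thread
    consumer.start()
    try:
        while consumer.is_alive():
            consumer.join(timeout=1.0)
    except KeyboardInterrupt:
        consumer.stop()  # sets self.stopped; run() commits and closes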