def main():
    happy_log_probs, sad_log_probs = readSentimentList(
        'twitter_sentiment_list.csv')

    consumer = KafkaConsumer("tweets", bootstrap_servers=["localhost:9092"],
                             auto_offset_reset='smallest')

    kafka = KafkaClient("localhost:9092")
    producer = SimpleProducer(kafka)
    topic = 'hashtag_sentiment'

    positive_tags = Counter()
    negative_tags = Counter()

    while True:
        for message in consumer.fetch_messages():
            txt = message.value
            txt = re.sub(r'[^\x00-\x7F]', ' ', txt)

            hashtags, sentiment = classifySentiment(
                txt, happy_log_probs, sad_log_probs)

            for hashtag in hashtags:
                if sentiment > 0:
                    positive_tags[hashtag] += 1
                else:
                    negative_tags[hashtag] += 1

        results = {}
        for key, val in positive_tags.most_common(20):
            results[key] = val

        producer.send_messages(topic, json.dumps(results))
        time.sleep(10)
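The example above uses the legacy kafka-python interfaces (SimpleProducer, fetch_messages()). As a rough sketch, the same tweet-to-hashtag flow could look like this with the current API; topic names and broker come from the example, everything else is an assumption.

import json
from kafka import KafkaConsumer, KafkaProducer

# Sketch only: same topics and broker as above, current kafka-python API assumed.
consumer = KafkaConsumer('tweets',
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='earliest')   # 'smallest' in the legacy API
producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda v: json.dumps(v).encode('utf-8'))

for message in consumer:                  # plain iteration replaces fetch_messages()
    text = message.value.decode('utf-8', errors='replace')
    # ... classify `text` and update the hashtag counters as in main() ...
    producer.send('hashtag_sentiment', {'example_tag': 1})  # the example batches this and sends every 10s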
Example #2
 def read_groups(self):
     self.log.info("Kafka consumer running")
     self.consumer = KafkaConsumer(
         CONSUMER_OFFSET_TOPIC,
         group_id='offset_monitoring_consumer',
         bootstrap_servers=self.kafka_config.broker_list,
         auto_offset_reset='smallest',
         auto_commit_enable=False,
         consumer_timeout_ms=10000,
     )
     self.log.info("Consumer ready")
     self.watermarks = self.get_current_watermarks()
     while not self.finished():
         try:
             message = self.consumer.next()
             max_offset = self.get_max_offset(message.partition)
             if message.offset >= max_offset - 1:
                 self.finished_partitions.add(message.partition)
         except ConsumerTimeout:
             break
         except (
                 FailedPayloadsError,
                 KafkaUnavailableError,
                 LeaderNotAvailableError,
                 NotLeaderForPartitionError,
         ) as e:
             self.log.warning("Got %s, retrying", e.__class__.__name__)
         self.process_consumer_offset_message(message)
     return self.kafka_groups
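The snippet above relies on consumer.next() and ConsumerTimeout, both removed from current kafka-python. A standalone sketch of the same bounded read with the newer API (the function name and parameters are placeholders, not the project's actual code):

from kafka import KafkaConsumer

# Sketch: bounded read of an offsets topic with the current API (names are placeholders).
def read_offsets_topic(broker_list, handle_message,
                       topic='__consumer_offsets', timeout_ms=10000):
    consumer = KafkaConsumer(topic,
                             group_id='offset_monitoring_consumer',
                             bootstrap_servers=broker_list,
                             auto_offset_reset='earliest',   # replaces 'smallest'
                             enable_auto_commit=False,       # replaces auto_commit_enable
                             consumer_timeout_ms=timeout_ms)
    for message in consumer:    # iteration simply ends after timeout_ms without new records
        handle_message(message)
    consumer.close()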
def consumer():
    # consumer = SimpleConsumer(client, group=None,
    # topic=topic, partitions=[0, ],
    #                                   auto_commit=False)
    # node_id = list(consumer.client.conns.keys())[0]
    # print dir(consumer.client.conns[node_id])
    # for i in consumer.get_messages(100):
    # 	print i.offset
    # consumer.commit()
    # from pykafka import KafkaClient
    #
    # client = KafkaClient(hosts="127.0.0.1:9092")
    # print client.topics
    # topic1 = client.topics[topic]
    # consumer = topic1.get_simple_consumer(auto_commit_enable=True, )
    # for message in consumer:
    #     if message is not None:
    #         print message.offset, message.value

    connect_str = '127.0.0.1:9092'
    consumer = KafkaConsumer(topic, group_id='my-group', bootstrap_servers=[connect_str],
                             auto_offset_reset='largest', auto_commit_enable=True,
                             auto_commit_interval_messages=1000)  # 'largest' or 'smallest'
    consumer.set_topic_partitions((topic, 2, 50032),)  # Optionally specify offsets to start from
    # kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})  # a partition can only be consumed by one consumer within a group, so it is best to specify which partitions to consume
    # kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })
    # print consumer.topics
    for message in consumer:
        print ("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key,
                                             message.value))
    consumer.commit()
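set_topic_partitions() was removed from kafka-python; pinning partition 2 at offset 50032 as above is now done with assign() plus seek(). A hedged sketch with a placeholder topic and the same broker:

from kafka import KafkaConsumer, TopicPartition

# Sketch: manual partition assignment with an explicit starting offset (values from the example above).
consumer = KafkaConsumer(group_id='my-group',
                         bootstrap_servers=['127.0.0.1:9092'],
                         enable_auto_commit=True)
tp = TopicPartition('my-topic', 2)        # 'my-topic' stands in for the example's `topic`
consumer.assign([tp])                     # replaces set_topic_partitions()
consumer.seek(tp, 50032)
for message in consumer:
    print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                         message.offset, message.key, message.value))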
Example #4
    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(
                    CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp
            for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks
            and self.watermarks[tp.partition].highmark > 0 and self.watermarks[
                tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[
                    message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups) if topics
        }
Example #5
def CheckTopicExisted(topic):

    consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                             enable_auto_commit=False,
                             group_id='consumer')

    # Get all partitions by topic
    par = consumer.partitions_for_topic(topic)
    print(par)

    if par is None:
        return False
    return True
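An equivalent check can go through consumer.topics(), which forces a metadata refresh; a small sketch, assuming the same tmpbootstrap_servers global as above:

from kafka import KafkaConsumer

# Sketch: topic-existence check via a metadata fetch (tmpbootstrap_servers assumed as above).
def check_topic_exists(topic):
    consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                             enable_auto_commit=False,
                             group_id='consumer')
    try:
        return topic in consumer.topics()   # topics() refreshes cluster metadata
    finally:
        consumer.close()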
    def invoke_kafka_consumer(self, p_str_broker_host, p_is_sasl):
        if p_is_sasl:
            # consumer = Consumer({
            #     'bootstrap.servers': config.BOOTSTRAP_SERVERS,
            #     'group.id': config.CONSUMER_GROUP,
            #     'enable.auto.commit': False,
            # })

            return KafkaConsumer(bootstrap_servers=p_str_broker_host,
                                 security_protocol='SASL_PLAINTEXT',
                                 sasl_mechanism='PLAIN',
                                 sasl_plain_username='******',
                                 sasl_plain_password='******')
        else:
            return KafkaConsumer(bootstrap_servers=p_str_broker_host)
Example #7
class IndexedConsumer():
    """
    A simple consumer to retrieve messages from the input queue when it is time to send them
    """
    def __init__(self, input_topic, hosts):
        self.input_topic = input_topic
        self.consumer = KafkaConsumer(bootstrap_servers=hosts)

    def retrieve_event(self, event_reference):
        self.consumer.set_topic_partitions(
            (self.input_topic, event_reference.partition,
             event_reference.offset))
        message = self.consumer.next()
        event = ScheduledEvent.from_dict(json.loads(message.value))
        return event
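retrieve_event uses set_topic_partitions() and consumer.next(), which no longer exist; with current kafka-python the same single-record lookup is assign() + seek() + next(). A standalone sketch that returns the decoded dict (wrapping it in ScheduledEvent is left to the caller):

import json
from kafka import KafkaConsumer, TopicPartition

# Sketch: fetch one record at a known (partition, offset) with the current API.
def retrieve_record(hosts, topic, partition, offset):
    consumer = KafkaConsumer(bootstrap_servers=hosts)
    tp = TopicPartition(topic, partition)
    consumer.assign([tp])                 # manual assignment replaces set_topic_partitions()
    consumer.seek(tp, offset)
    record = next(consumer)               # first record at or after the requested offset
    consumer.close()
    return json.loads(record.value)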
Example #8
def create_consumer(topics, brokers, group,
                    max_bytes=1024 * 1024, max_wait_ms=100):
    kafka = KafkaConsumer(*topics, metadata_broker_list=brokers,
                          group_id=group,
                          fetch_message_max_bytes=max_bytes,
                          fetch_wait_max_ms=max_wait_ms)
    return kafka
def get_kafka_consumer(topic: str, consumer_group: str) -> KafkaConsumer:
    return KafkaConsumer(topic,
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='latest',
                         enable_auto_commit=True,
                         group_id=consumer_group,
                         value_deserializer=lambda x: loads(x.decode('utf-8')))
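A brief usage sketch for the helper above; the topic and group names are placeholders.

# Sketch: consuming JSON records via the helper above ('events' / 'example-group' are placeholders).
consumer = get_kafka_consumer('events', 'example-group')
for record in consumer:
    payload = record.value               # already a dict thanks to value_deserializer
    print(record.topic, record.partition, record.offset, payload)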
Example #10
def kafka_consumer_entrypoint():
    global g_config_kafka
    print("Kafka config: " + str(g_config_kafka))
    consumer = KafkaConsumer(
        g_config_kafka['topic_name'],
        group_id=g_config_kafka['group_id'],
        bootstrap_servers=[g_config_kafka['bootstrap_server']],
        security_protocol="SSL",
        ssl_cafile=g_config_kafka['ssl_cafile'],
        ssl_keyfile=g_config_kafka['ssl_keyfile'],
        ssl_certfile=g_config_kafka['ssl_certfile'])

    for message in consumer:
        # Extract message.value:
        try:
            value = json.loads(message.value.decode('utf-8'))
            # Message is a json dictionary of form:
            # { site_id: ..., status_code: ..., regex_results: ...}
            # print(";; debug: " + str(value))
            db_store_probe_results(value['site_id'], value['status_code'],
                                   value['regex_results'])
        except Exception:
            print("Unable to parse message from the kafka topic.")
            raise
    return True
Example #11
 def configure_internal_queues(self):
     """
     configures the internal queues used hold references to events in the input queue
     """
     for i in range(self.number_of_queues):
         client = KafkaClient(hosts=self.kafka_hosts)
         queue_name = SCHEDULER_QUEUE_FORMAT.format(2**i)
         client.ensure_topic_exists(queue_name)
         indexed_consumer = IndexedConsumer(self.input_topic,
                                            self.kafka_hosts)
         queue_consumer = KafkaConsumer(
             queue_name,
             bootstrap_servers=self.kafka_hosts,
             group_id=queue_name,
             consumer_timeout_ms=2000,
             auto_commit_enable=False,
         )
         queue_producer = SimpleProducer(client)
         queue_duration = 2**i
         self.queues.append(
             InternalQueue(
                 queue_consumer,
                 indexed_consumer,
                 queue_producer,
                 self.number_of_queues,
                 queue_duration,
             ))
Example #12
 def read_groups(self):
     self.log.info("Kafka consumer running")
     self.consumer = KafkaConsumer(
         CONSUMER_OFFSET_TOPIC,
         group_id='offset_monitoring_consumer',
         bootstrap_servers=self.kafka_config.broker_list,
         auto_offset_reset='smallest',
         auto_commit_enable=False,
         consumer_timeout_ms=10000,
     )
     self.log.info("Consumer ready")
     self.watermarks = self.get_current_watermarks()
     while not self.finished():
         try:
             message = self.consumer.next()
             max_offset = self.get_max_offset(message.partition)
             if message.offset >= max_offset - 1:
                 self.finished_partitions.add(message.partition)
         except ConsumerTimeout:
             break
         except (
                 FailedPayloadsError,
                 KafkaUnavailableError,
                 LeaderNotAvailableError,
                 NotLeaderForPartitionError,
         ) as e:
             self.log.warning("Got %s, retrying", e.__class__.__name__)
         self.process_consumer_offset_message(message)
     return self.kafka_groups
Example #13
    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(list(self.active_partitions.values()))
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups)
            if topics
        }
Example #14
 def __init__(self, topic, addr, auto_commit=False, auto_offset_reset="earliest"):
     """Initializes with Topic Name, Broker Address, and Consumer Settings"""
     self.consumer = KafkaConsumer(topic,
                                     bootstrap_servers=addr,
                                     value_deserializer=lambda m: json.loads(m.decode('ascii')),
                                     enable_auto_commit=auto_commit,
                                     auto_offset_reset=auto_offset_reset,
                                     api_version=(0,1,0))
Example #15
 def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id, topic,
              logTopic, interval):
     self.kafkaHost = kafkaHost
     self.kafkaPort = kafkaPort
     self.tcpHost = tcpHost
     self.tcpPort = tcpPort
     self.group_id = group_id
     self.topic = topic
     self.logTopic = logTopic
     self.interval = int(interval)
     self.consumer = KafkaConsumer(
         topic,
         bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
         group_id=group_id,
         enable_auto_commit=False)
     self.producer = KafkaProducer(
         bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
     self.tcpWriter = None
Example #16
 def __init__(self, kafka_host, kafka_port, tcp_host, tcp_port, topic,
              log_topic):
     self.kafka_host = kafka_host
     self.kafka_port = kafka_port
     self.tcp_host = tcp_host
     self.tcp_port = tcp_port
     self.topic = topic
     self.log_topic = log_topic
     self.consumer = KafkaConsumer(
         topic,
         bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)],
         enable_auto_commit=False,
         max_poll_records=1024 * 1024,
         max_partition_fetch_bytes=1024 * 1024 * 100)
     self.producer = KafkaProducer(
         bootstrap_servers=["{}:{}".format(kafka_host, kafka_port)])
     self.connections = {}
     self.sample_end_time = self.get_end_time(time())
     self.lastPolled = []
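The class above is set up for batch polling (note lastPolled and the fetch limits); a hedged sketch of how one poll step might look as a method of that class, with the attribute names assumed from the snippet:

# Sketch: one batch poll with the consumer configured above (attribute names assumed).
def poll_once(self, timeout_ms=1000):
    batches = self.consumer.poll(timeout_ms=timeout_ms)   # {TopicPartition: [ConsumerRecord, ...]}
    self.lastPolled = [record for records in batches.values() for record in records]
    return self.lastPolled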
Example #17
def thread_main(topic):
    consumer = KafkaConsumer(topic,
                             group_id='kafka_monitor',
                             metadata_broker_list=broker_list)

    offset = consumer._offsets.fetch

    for part in offset:
        kafka_logsize.labels(topic=part[0],
                             partition=part[1]).set(offset[part])
Example #18
def kafka_input(collector, **options):
    group_id = options.pop("group_id", "hackathon")
    broker = options.pop("broker", os.getenv("KAFKA_BROKER", "").split(","))

    consumer = KafkaConsumer(collector, metadata_broker_list=broker,
                             group_id=group_id, auto_commit_enable=False)
    return {
        "collector": collector,
        "files": [KafkaInputBview(consumer, collector), kafka_iter(consumer)],
        "format": kafka_format
    }
Example #19
    def _create_kafka_consumer(self):
        consumer = KafkaConsumer(
            self._kafka_topic,
            bootstrap_servers=self._kafka_brokers,
            auto_offset_reset=self._kafka_start_offset,
            # 'largest': what to do when ZooKeeper has no initial offset, or when the offset is out of range
            enable_auto_commit=False,
            # when True, the consumer syncs its offset to ZooKeeper after consuming, so a new consumer can resume from the latest offset after a failure
            client_id=str(uuid.uuid1()) if id is None else id,
            group_id=self._kafka_group)  # discard old ones

        return consumer
Example #20
 def initialize(self, stormconf, context):
     #self.words = itertools.cycle(['dog', 'cat',
     #                              'zebra', 'elephant'])
     #self.sentences = [
     #    "She advised him to take a long holiday, so he immediately quit work and took a trip around the world",
     #    "I was very glad to get a present from her",
     #    "He will be here in half an hour",
     #    "She saw him eating a sandwich",
     #]
     #self.sentences = itertools.cycle(self.sentences)
     self.consumer = KafkaConsumer(b'twitterstream',
                                   bootstrap_servers=['0.0.0.0:9092'])
Example #21
def commitTopic(topic, group, partition, commit_offset):
    try:
        print(
            '===================================================================================='
        )
        print('[commitTopic] : topic=' + topic + ', group=' + group +
              ', partition=' + str(partition) + ', commit_offset=' +
              str(commit_offset))
        consumer2 = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                                  enable_auto_commit=False,
                                  group_id=group)
        tp = TopicPartition(topic, partition)

        if int(commit_offset) > 0:

            consumer2.commit({tp: OffsetAndMetadata(commit_offset, None)})

    except Exception as ee:
        print('error when commit Topic')
        print(str(ee))
    finally:
        print('commitTopic end')
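To double-check a manual commit like the one above, kafka-python exposes committed(); a sketch under the same assumed tmpbootstrap_servers global:

from kafka import KafkaConsumer, TopicPartition

# Sketch: read back the committed offset for a group/topic/partition (tmpbootstrap_servers assumed as above).
def get_committed_offset(topic, group, partition):
    consumer = KafkaConsumer(bootstrap_servers=tmpbootstrap_servers,
                             enable_auto_commit=False,
                             group_id=group)
    try:
        return consumer.committed(TopicPartition(topic, partition))   # None if nothing committed yet
    finally:
        consumer.close()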
Example #22
 def configure_input_queue(self):
     """
     configures the input queue that other services can use to schedule an event to be delivered
     """
     client = KafkaClient(hosts=self.kafka_hosts)
     client.ensure_topic_exists(self.input_topic)
     indexed_consumer = IndexedConsumer(self.input_topic, self.kafka_hosts)
     queue_consumer = KafkaConsumer(self.input_topic,
                                    bootstrap_servers=self.kafka_hosts,
                                    group_id=CONSUMER_GROUP)
     queue_producer = SimpleProducer(KafkaClient(hosts=self.kafka_hosts))
     self.queues.append(
         InputQueue(queue_consumer, indexed_consumer, queue_producer,
                    self.number_of_queues))
Example #23
 def receive_message(self, cgroup_name):
     consumer = KafkaConsumer(TOPIC,
                              group_id=cgroup_name,
                              bootstrap_servers=[BOOTSTRAP_IP])
     try:
         for msg in consumer:
             msg = msg.value
             logger.info("consumer receive message %s" % msg)
             future = self.thread_pool.submit(self.msg_handler, (msg))
             future.add_done_callback(self.callback_handler)
     except Exception:
         logger.error("consumer error")
         logger.error(traceback.format_exc())
     finally:
         self.thread_pool.shutdown(wait=True)
Example #24
    def run(self):
        consumer = KafkaConsumer(bootstrap_servers=self.bootstrap_servers,
                                 auto_offset_reset='earliest',
                                 group_id=self.group,
                                 consumer_timeout_ms=1000)
        consumer.subscribe(self.topics)
        while not self.stop_event.is_set():
            for message in consumer:
                print(message)
                if self.stop_event.is_set():
                    break

        consumer.close()
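A sketch of how a thread exposing the run() method above is usually driven; `Consumer` stands for the surrounding threading.Thread subclass, which is not shown in the snippet, so its construction is an assumption.

import time

# Sketch: driving the run() loop above; `Consumer` is the assumed Thread subclass from the snippet.
worker = Consumer()          # hypothetical constructor; the class holds self.stop_event internally
worker.start()
time.sleep(30)               # let it consume for a while
worker.stop_event.set()      # run() checks this flag, breaks out of the loop, and closes the consumer
worker.join()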
Example #25
def consume(args):
    schema = args.schema
    table = args.table
    assert schema in settings.SCHEMAS, 'schema must in settings.SCHEMAS'
    assert table in settings.TABLES, 'table must in settings.TABLES'
    group_id = f'{schema}.{table}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    topic = settings.KAFKA_TOPIC
    partition = settings.PARTITIONS.get(group_id)
    consumer.assign([TopicPartition(topic, partition)])
    event_list = []
    logger.info(
        f'success consume topic:{topic},partition:{partition},schema:{schema},table:{table}'
    )
    pk = reader.get_primary_key(schema, table)
    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_list.append(event)
        len_event = len(event_list)
        if len_event == settings.INSERT_NUMS or (
            (int(time.time() * 10**6) - event_list[0]['event_unixtime']) /
                10**6 >= settings.INSERT_INTERVAL > 0):
            data_dict = {}
            tmp_data = []
            for items in event_list:
                action = items['action']
                action_core = items['action_core']
                data_dict.setdefault(table + schema + action + action_core,
                                     []).append(items)
            for k, v in data_dict.items():
                tmp_data.append(v)
            result = writer.insert_event(tmp_data, settings.SKIP_TYPE,
                                         settings.SKIP_DELETE_TB_NAME, schema,
                                         table, pk)
            if result:
                event_list = []
                consumer.commit()
                logger.info(f'commit success {len_event} events!')
            else:
                logger.error('insert event error!')
                exit()
    def run(self):
        avro_serde = AvroSerDe(AVRO_SCHEMA_STRING)
        client = KafkaClient('localhost:9092')
        consumer = KafkaConsumer(KAFKA_TOPIC,
                                 group_id='my_group',
                                 bootstrap_servers=['localhost:9092'])

        # Keep track of and print statistics.
        attempts = 0
        failures = 0
        failure_rate = 0.0
        for message in consumer:
            event = avro_serde.bytes_to_obj(message.value)
            print('--> ' + str(event))
            if event['op'] == 'login':
                attempts += 1
                if not event['success']: failures += 1
                failure_rate = float(failures) / attempts
            print('--> Event: ' + str(event))
            print('--> Failure Rate: ' + str(failure_rate))
Example #27
class KafkaGroupReader:
    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self.kafka_groups = defaultdict(set)
        self.finished_partitions = set()

    def read_groups(self):
        self.log.info("Kafka consumer running")
        self.consumer = KafkaConsumer(
            CONSUMER_OFFSET_TOPIC,
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='smallest',
            auto_commit_enable=False,
            consumer_timeout_ms=10000,
        )
        self.log.info("Consumer ready")
        self.watermarks = self.get_current_watermarks()
        while not self.finished():
            try:
                message = self.consumer.next()
                max_offset = self.get_max_offset(message.partition)
                if message.offset >= max_offset - 1:
                    self.finished_partitions.add(message.partition)
            except ConsumerTimeout:
                break
            except (
                    FailedPayloadsError,
                    KafkaUnavailableError,
                    LeaderNotAvailableError,
                    NotLeaderForPartitionError,
            ) as e:
                self.log.warning("Got %s, retrying", e.__class__.__name__)
            self.process_consumer_offset_message(message)
        return self.kafka_groups

    def parse_consumer_offset_message(self, message):
        key = bytearray(message.key)
        ((key_schema, ), cur) = relative_unpack('>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()  # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition, ), cur) = relative_unpack('>l', key, cur)
        if message.value:
            value = bytearray(message.value)
            ((value_schema, ), cur) = relative_unpack('>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset, ), cur) = relative_unpack('>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return str(group), str(topic), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(
                message)
        except InvalidMessageException:
            return

        if offset:
            self.kafka_groups[group].add(topic)
        else:  # No offset means group deletion
            self.kafka_groups.pop(group, None)

    def get_current_watermarks(self):
        self.consumer._client.load_metadata_for_topics()
        offsets = get_topics_watermarks(
            self.consumer._client,
            [CONSUMER_OFFSET_TOPIC],
        )
        return {
            partition: offset
            for partition, offset in
            offsets[CONSUMER_OFFSET_TOPIC].iteritems()
            if offset.highmark > offset.lowmark
        }

    def get_max_offset(self, partition):
        return self.watermarks[partition].highmark

    def finished(self):
        return len(self.finished_partitions) >= len(self.watermarks)
Example #28
    parser = argparse.ArgumentParser()
    parser.add_argument("collector")
    parser.add_argument("--from-beginning", action="store_true")
    parser.add_argument("--ripe-servers", default=",".join(RIPE_SERVERS))
    parser.add_argument("--our-servers", default="localhost:9092")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_http_server(4340 + PARTITIONS[args.collector])
    logger.info("loading the stats server on %s", 4340 + PARTITIONS[args.collector])

    consumer = KafkaConsumer("raw-{}".format(args.collector),
                             group_id='test_hackathon10',
                             bootstrap_servers=args.ripe_servers.split(","))

    save_file = "offsets-{}".format(args.collector)
    if args.from_beginning:
        logger.info("starting from scratch")
        offsets = {("raw-{}".format(args.collector), i): 0 for i in range(0, 10)}
        consumer.set_topic_partitions(offsets)
    elif os.path.exists(save_file):
        with open(save_file, "r") as f:
            offsets = cPickle.load(f)
        logger.info("loading offsets from file: %s", offsets)
        consumer.set_topic_partitions(offsets)
    else:
        logger.info("starting from last messages")
Example #29
        funcs.append(partial(annotate_if_roa, ro_rad_tree))

    if args.irr_org_file is not None and args.irr_mnt_file:
        relations_dict = dict()
        fill_relation_struct(args.irr_org_file, relations_dict,
                             "organisations")
        fill_relation_struct(args.irr_mnt_file, relations_dict, "maintainers")
        funcs.append(partial(annotate_if_relation, relations_dict))

    if args.as_rel_file is not None and args.ppdc_ases_file is not None and args.as2org_file is not None:
        a, b, c, d = caida_filter_annaunce(args.as_rel_file, args.ppdc_ases_file, args.as2org_file)
        funcs.append(partial(is_legittimate, a, b, c, d))

    if args.from_timestamp is None:
        consumer = KafkaConsumer("conflicts",
                                 metadata_broker_list=args.our_servers.split(","),
                                 group_id="detector",
                                 auto_commit_enable=False)
        offset, = consumer.get_partition_offsets("conflicts", PARTITIONS[args.collector], -1, 1)
        consumer.set_topic_partitions({("conflicts", PARTITIONS[args.collector]): offset - 1})
        last_message = next(iter(consumer))
        last_data = json.loads(last_message.value)
        last_ts = last_data["timestamp"]
        logger.info("last detected event was at offset %s timestamp %s", offset, last_ts)
    else:
        last_ts = args.from_timestamp

    logger.info("detecting conflicts newer than %s", datetime.utcfromtimestamp(last_ts))

    start_http_server(4240 + PARTITIONS[args.collector])

    client = KafkaClient(args.our_servers.split(","))
Example #30
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self.kafka_groups = defaultdict(set)
        self.finished_partitions = set()

    def read_groups(self):
        self.log.info("Kafka consumer running")
        self.consumer = KafkaConsumer(
            CONSUMER_OFFSET_TOPIC,
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='smallest',
            auto_commit_enable=False,
            consumer_timeout_ms=10000,
        )
        self.log.info("Consumer ready")
        self.watermarks = self.get_current_watermarks()
        while not self.finished():
            try:
                message = self.consumer.next()
                max_offset = self.get_max_offset(message.partition)
                if message.offset >= max_offset - 1:
                    self.finished_partitions.add(message.partition)
            except ConsumerTimeout:
                break
            except (
                    FailedPayloadsError,
                    KafkaUnavailableError,
                    LeaderNotAvailableError,
                    NotLeaderForPartitionError,
            ) as e:
                self.log.warning("Got %s, retrying", e.__class__.__name__)
            self.process_consumer_offset_message(message)
        return self.kafka_groups

    def parse_consumer_offset_message(self, message):
        key = bytearray(message.key)
        ((key_schema,), cur) = relative_unpack('>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()   # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack('>l', key, cur)
        if message.value:
            value = bytearray(message.value)
            ((value_schema,), cur) = relative_unpack('>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack('>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return str(group), str(topic), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset:
            self.kafka_groups[group].add(topic)
        else:  # No offset means group deletion
            self.kafka_groups.pop(group, None)

    def get_current_watermarks(self):
        self.consumer._client.load_metadata_for_topics()
        offsets = get_topics_watermarks(
            self.consumer._client,
            [CONSUMER_OFFSET_TOPIC],
        )
        return {partition: offset for partition, offset
                in offsets[CONSUMER_OFFSET_TOPIC].iteritems()
                if offset.highmark > offset.lowmark}

    def get_max_offset(self, partition):
        return self.watermarks[partition].highmark

    def finished(self):
        return len(self.finished_partitions) >= len(self.watermarks)
Example #31
    import argparse

    relations, childs, parents = caida_filter_annaunce(
        "20160101.as-rel.txt", "20160101.ppdc-ases.txt")

    print(len(relations), len(childs), len(parents))
    parser = argparse.ArgumentParser(
        description="get a feed of abnormal BGP conflicts")
    parser.add_argument("--offset", type=int)

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    consumer = KafkaConsumer("hijacks",
                             bootstrap_servers=["comet-17-08.sdsc.edu:9092"],
                             group_id="client")
    if args.offset is not None:
        topics = [("hijacks", i, args.offset) for i in PARTITIONS.values()]
        consumer.set_topic_partitions(*topics)

    hijacks = 0
    total = 0
    for item in consumer:
        total += 1
        if (is_legittimate(relations, childs, parents,
                           json.loads(item.value)) == 0):
            hijacks += 1
            #print(item.value)

        if (total == 10000): print(total, hijacks)
Example #32
from kafka.client import KafkaClient
from kafka.consumer import KafkaConsumer
from kafka.producer import SimpleProducer

import numpy as np
from sklearn import svm
from sklearn.externals import joblib

import mysql.connector
from datetime import datetime

import json


client = KafkaClient("ip-172-31-28-55.ec2.internal:6667")
consumer = KafkaConsumer("shm", metadata_broker_list=['ip-172-31-28-55.ec2.internal:6667'])
#consumer = KafkaConsumer("shm", metadata_broker_list=['ip-172-31-28-55.ec2.internal:6667'])

conn = mysql.connector.connect(user='******', password='******',
                              host='iotshm-data.ck3sx5qm0blx.us-west-2.rds.amazonaws.com',
                              database='iotshm')

cursor = conn.cursor()

#add_health = ("""INSERT IGNORE INTO iotshm.Health (sensor_id, timestamp, reading_type, healthy) VALUES (%s, %s, %s, %s)""")
add_magnitude = ("""INSERT IGNORE INTO iotshm.Magnitude (frequency, sensor_id, magnitude, reading_type, timestamp, healthy) VALUES(%s, %s, %s, %s, %s, %s)""")


# TODO add new classifier files and change file names
x_clf = joblib.load('xClf.pkl')
y_clf = joblib.load('xClf.pkl')
Example #33
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Basics').getOrCreate()

# import pyspark class Row from module sql
from pyspark.sql import *
from pyspark.sql.types import *
import tempfile

# ml
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

# start a kafka consumer session
from kafka.consumer import KafkaConsumer
consumer = KafkaConsumer(
    "titanic",
    bootstrap_servers=['ip-172-31-12-218.us-east-2.compute.internal:6667'])
print('consumer launched')

testSchema = [
    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket",
    "Fare", "Cabin", "Embarked"
]

pipeline = Pipeline.load("/home/ubuntu/titanic/pipeline")
model = PipelineModel.load("/home/ubuntu/titanic/model")


def getTrain(msg):
    # put passenger info into dataframe
    # print msg
Example #34
    broker = SimpleClient(kafka)
    lags = {}
    zk = KazooClient(hosts=zookeepers, read_only=True)
    zk.start()
    logsize = 0
    #    topics=zk.get_children("/consumers/%s/owners" % (group) )
    topic = sys.argv[1]
    data_need = sys.argv[2]
    #    for topic in topics:
    if topic:
        logsize = 0
        #	print topic
        partitions = broker.get_partition_ids_for_topic(topic)
        #	print partitions
        consumer = KafkaConsumer(broker, group, str(topic))
        responses = broker.send_offset_fetch_request(
            group, [OffsetFetchRequestPayload(topic, p) for p in partitions],
            fail_on_error=True)
        #	print responses
        latest_offset = 0
        for res in responses:
            if topic != "test":
                latest_offset += res[2]
            #	print latest_offset
        for partition in partitions:
            log = "/consumers/%s/offsets/%s/%s" % (group, topic, partition)
            if zk.exists(log):
                data, stat = zk.get(log)
                logsize += int(data)
            #	print logsize
Example #35
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        # Fetch metadata as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(list(self.active_partitions.values()))
        # Active partitions are not empty. Remove the empty ones.
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups)
            if topics
        }

    def _remove_unsubscribed_topics(self):
        for group, topics in list(six.iteritems(self._kafka_groups)):
            for topic, partitions in list(six.iteritems(topics)):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()   # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group {group} topic {topic} and updated offset in list of groups".format(
                    group=group,
                    topic=topic,
                ),
            )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif offset is None and group in self._kafka_groups and \
                topic in self._kafka_groups[group]:  # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition for tp in partitions) if partitions else None
        return {part: offset for part, offset
                in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
                if offset.highmark > offset.lowmark and
                (partitions is None or part in partitions_set)}

    def finished(self):
        return self._finished
Example #36
def consume(args):
    schema = args.schema
    tables = args.tables
    skip_error = args.skip_error
    assert schema in settings.SCHEMAS, f'schema {schema} must in settings.SCHEMAS'
    topic = settings.KAFKA_TOPIC
    tables_pk = {}
    partitions = []
    for table in tables.split(','):
        assert table in settings.TABLES, f'table {table} must in settings.TABLES'

        partition = settings.PARTITIONS.get(f'{schema}.{table}')
        tp = TopicPartition(topic, partition)
        partitions.append(tp)
        tables_pk[table] = reader.get_primary_key(schema, table)

    group_id = f'{schema}.{tables}'
    consumer = KafkaConsumer(
        bootstrap_servers=settings.KAFKA_SERVER,
        value_deserializer=lambda x: json.loads(x, object_hook=object_hook),
        key_deserializer=lambda x: x.decode() if x else None,
        enable_auto_commit=False,
        group_id=group_id,
        auto_offset_reset='earliest',
    )
    consumer.assign(partitions)

    event_list = {}
    is_insert = False
    last_time = 0
    len_event = 0
    logger.info(f'success consume topic:{topic},partitions:{partitions},schema:{schema},tables:{tables}')

    for msg in consumer:  # type:ConsumerRecord
        logger.debug(f'kafka msg:{msg}')
        event = msg.value
        event_unixtime = event['event_unixtime'] / 10 ** 6
        table = event['table']
        schema = event['schema']
        event_list.setdefault(table, []).append(event)
        len_event += 1

        if last_time == 0:
            last_time = event_unixtime

        if len_event == settings.INSERT_NUMS:
            is_insert = True
        else:
            if event_unixtime - last_time >= settings.INSERT_INTERVAL > 0:
                is_insert = True
        if is_insert:
            data_dict = {}
            events_num = 0
            for table, items in event_list.items():
                for item in items:
                    action = item['action']
                    action_core = item['action_core']
                    data_dict.setdefault(table, {}).setdefault(table + schema + action + action_core, []).append(item)
            for table, v in data_dict.items():
                tmp_data = []
                for k1, v1 in v.items():
                    events_num += len(v1)
                    tmp_data.append(v1)
                try:
                    result = writer.insert_event(tmp_data, schema, table, tables_pk.get(table))
                    if not result:
                        logger.error('insert event error!')
                        if not skip_error:
                            exit()
                except Exception as e:
                    logger.error(f'insert event error!,error:{e}')
                    if not skip_error:
                        exit()
            consumer.commit()
            logger.info(f'commit success {events_num} events!')
            event_list = {}
            is_insert = False
            len_event = last_time = 0
    parser.add_argument("--our-servers", default=",".join(["comet-17-22.sdsc.edu:9092"]))
    parser.add_argument("--as-rel-file",
                        help="TXT file containing AS relation")
    parser.add_argument("--ppdc-ases-file")

    args = parser.parse_args()

    collectors = COLLECTORS if len(args.collector) == 0 else args.collector

    logging.basicConfig(level=logging.INFO)

    topics = ["rib-{}".format(c) for c in collectors]
    logger.info("using topics %s", topics)

    consumer = KafkaConsumer(*topics,
                             bootstrap_servers=args.our_servers.split(","),
                             group_id="follower")

    if args.offset is not None:
        consumer.set_topic_partitions({(t, 0): args.offset for t in topics})

    # setup filters
    filters = []

    if args.anycast_file is not None:
        anycast = Radix()
        count = 0
        with open(args.anycast_file, "r") as f:
            for prefix in f:
                if not prefix.startswith("#"):
                    anycast.add(prefix.strip())
def main():
    logger = logging.getLogger(os.path.basename(__file__))

    # Setup Aiven SDK
    logger.info("Setting up Aiven SDK")
    client = AivenClient("https://api.aiven.io")
    client.set_auth_token(os.environ["AIVEN_TOKEN"])

    # Lookup the target service
    logger.info("Looking up the target Aiven Kafka Service")
    service = client.get_service(project=os.environ["AIVEN_PROJECT"],
                                 service=os.environ["AIVEN_SERVICE"])
    if not service:
        raise SystemExit("Failed to look up the target service")

    # Store credentials on disk. This is using the main access certificates (avnadmin).
    logger.info("Storing Aiven service access credentials")
    with open("client.crt", "w") as fh:
        fh.write(service["connection_info"]["kafka_access_cert"])
    with open("client.key", "w") as fh:
        fh.write(service["connection_info"]["kafka_access_key"])

    # Project CA certificate
    logger.info("Fetching project CA certificate")
    result = client.get_project_ca(project=os.environ["AIVEN_PROJECT"])
    with open("ca.crt", "w") as fh:
        fh.write(result["certificate"])

    # Initialize Kafka client
    kafka_client = KafkaConsumer(
        bootstrap_servers=service["service_uri"],
        security_protocol="SSL",
        ssl_cafile="ca.crt",
        ssl_certfile="client.crt",
        ssl_keyfile="client.key",
    )

    partitions = kafka_client.partitions_for_topic(os.environ["AIVEN_TOPIC"])
    tps = [
        TopicPartition(os.environ["AIVEN_TOPIC"], partition)
        for partition in partitions
    ]
    last_timestamp = time.monotonic()
    last_offsets = {}

    logger.info("Start result collection loop, break with CTRL-C")
    readings = []
    while True:
        delta = 0
        result = kafka_client.end_offsets(tps)
        timenow = time.monotonic()
        for tp, offset in result.items():
            if tp in last_offsets:
                delta += offset - last_offsets[tp]
            last_offsets[tp] = offset

        messages_per_second = int(delta / (timenow - last_timestamp))

        readings.append(messages_per_second)
        readings = readings[-30:]

        logger.info("%d messages/s, 30 sample average %d messages/s",
                    messages_per_second,
                    sum(readings) / len(readings))
        last_timestamp = timenow
        time.sleep(2)
Example #39
if __name__ == "__main__":
    import argparse

    relations, childs, parents = caida_filter_annaunce("20160101.as-rel.txt", "20160101.ppdc-ases.txt")

    print(len(relations), len(childs), len(parents))
    parser = argparse.ArgumentParser(description="get a feed of abnormal BGP conflicts")
    parser.add_argument("--offset", type=int)

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    consumer = KafkaConsumer("hijacks",
                             bootstrap_servers=["comet-17-08.sdsc.edu:9092"],
                             group_id="client")
    if args.offset is not None:
        topics = [("hijacks", i, args.offset) for i in PARTITIONS.values()]
        consumer.set_topic_partitions(*topics)

    hijacks = 0
    total = 0
    for item in consumer:
        total += 1
        if is_legittimate(relations, childs, parents, json.loads(item.value)) == 0:
            hijacks += 1
            #print(item.value)

        if total == 10000: print(total, hijacks)
Example #40
from kafka.consumer import KafkaConsumer
from json import loads
from mongoengine import *

from matilda.data_pipeline import object_model

consumer = KafkaConsumer(
    'numtest',  # kafka topic
    bootstrap_servers=['localhost:9092'],  # same as our producer
    # It determines where the consumer starts reading when no valid committed offset exists (e.g. after breaking
    # down or being turned off) and can be set to either 'earliest' or 'latest'. When set to 'latest', the consumer
    # starts reading at the end of the log; when set to 'earliest', it starts at the beginning of the partition.
    auto_offset_reset='earliest',
    enable_auto_commit=True,  # makes sure the consumer commits its read offset every interval.
    # join a consumer group for dynamic partition assignment and offset commits
    # a consumer needs to be part of a consumer group to make the auto commit work.
    # otherwise, need to do it manually i.e. consumer.assign([TopicPartition('foobar', 2)]); msg = next(consumer)
    group_id='my-group',
    # deserialize encoded values
    value_deserializer=lambda x: loads(x.decode('utf-8')))
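A minimal consumption loop for the consumer configured above; persisting into object_model is only hinted at, since that model is not shown here.

# Sketch: reading the JSON-deserialized 'numtest' messages configured above.
for message in consumer:
    data = message.value                 # already a dict via value_deserializer
    print("offset %d: %s" % (message.offset, data))
    # a real pipeline would map `data` onto a mongoengine Document from object_model and save() it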


def get_atlas_db_url(username, password, dbname):
    return f"mongodb+srv://{username}:{password}@cluster0.ptrie.mongodb.net/{dbname}?retryWrites=true&w=majority&" \
           f"ssl=true"


atlas_url = get_atlas_db_url(username='******',
                             password='******',
                             dbname='matilda-db')