Example #1
class Kafka(object):
    executor = ThreadPoolExecutor(20)

    def __init__(self, broker):
        self.broker = broker
        self.client = KafkaClient(broker, timeout=3)

    @run_on_executor
    def getPartition(self, topic):
        """ 指定topic返回partition列表 """

        return self.client.get_partition_ids_for_topic(topic)

    @run_on_executor
    def getLogsize(self, topic, partitions):
        """ 指定topic与partition列表, 返回logsize数据 """

        tp = self.client.send_offset_request(
            [OffsetRequestPayload(topic, p, -1, 1) for p in partitions])
        return {p.partition: p.offsets[0] for p in tp}

    @run_on_executor
    def getOffsets(self, topic, partitions, group):
        """ 指定topic、partition和group, 返回offsets数据 """

        try:
            # First try the zookeeper-storage API to fetch the offsets;
            # it raises UnknownTopicOrPartitionError if the group has no offsets stored there
            tp = self.client.send_offset_fetch_request(
                group,
                [OffsetRequestPayload(topic, p, -1, 1) for p in partitions])
            offsets = {p.partition: p.offset for p in tp}

        except UnknownTopicOrPartitionError:
            # On that exception, fall back to the kafka-storage API to fetch the offsets
            consumer = KafkaConsumer(group_id=group,
                                     bootstrap_servers=self.broker,
                                     enable_auto_commit=False)
            tp = [TopicPartition(topic, p) for p in partitions]
            consumer.assign(tp)
            offsets = {p.partition: consumer.position(p) for p in tp}

        return offsets
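
The class above relies on Tornado's run_on_executor, so each method returns a future that has to be resolved on the IOLoop. A minimal usage sketch, assuming Tornado is available; the broker address, topic and group names are hypothetical:

from tornado import gen
from tornado.ioloop import IOLoop

@gen.coroutine
def show_lag():
    # Hypothetical broker/topic/group, for illustration only.
    kafka = Kafka("127.0.0.1:9092")
    partitions = yield kafka.getPartition("my-topic")
    logsize = yield kafka.getLogsize("my-topic", partitions)
    offsets = yield kafka.getOffsets("my-topic", partitions, "my-group")
    # Lag per partition = logsize (latest offset) minus committed offset.
    raise gen.Return({p: logsize[p] - offsets[p] for p in partitions})

print(IOLoop.current().run_sync(show_lag))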
Example #2
    def handler(self):
        """ 查询指定Kafka集群Topic中每个Partition当前Logsize, 将Logsize写入LevelDB
            每次收集Logsize数据后会检测retention_day参数,删除过期数据
        """

        clusters = base.config["collector"]["clusters"]

        for cluster, metric in clusters.items():
            client = KafkaClient(metric["brokers"], timeout=3)

            for topic in metric["topics"]:
                partitions = client.get_partition_ids_for_topic(topic)
                payload = [
                    OffsetRequestPayload(topic, p, -1, 1) for p in partitions
                ]
                logsize = {
                    p.partition: p.offsets[0]
                    for p in client.send_offset_request(payload)
                }

                if logsize:
                    key = str(int(time.time())).encode("utf-8")
                    value = json.dumps(logsize).encode("utf-8")

                    db = base.init_leveldb(cluster=cluster, topic=topic)
                    db.Put(key, value)
                    deadline = base.config["collector"]["clusters"][cluster][
                        "retention_hour"] * 3600

                    for key, _ in db.RangeIter():
                        if time.time() - int(key) > deadline:
                            db.Delete(key)
                        else:
                            break

            client.close()
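
handler() above expects a collector configuration keyed by cluster name, where each cluster entry carries the brokers, topics and retention_hour fields it reads. A sketch of what base.config["collector"]["clusters"] is expected to hold; only those key names come from the code, the cluster, broker and topic values are hypothetical:

clusters = {
    "cluster-a": {
        "brokers": "10.0.0.1:9092,10.0.0.2:9092",  # hypothetical brokers
        "topics": ["orders", "clicks"],            # hypothetical topics
        "retention_hour": 24,                      # keep 24 hours of samples
    },
}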
Example #3
    def __init__(self, settings):
        super(HHStrategyWorker, self).__init__(settings, topic)
        self.slot = Slot(log_processing=self.work,
                         incoming=self.incoming,
                         outgoing=self.outgoing,
                         is_master=settings.get("FRONTERA_MASTER"))
        kafka_hh = KafkaClient(settings.get('KAFKA_LOCATION_HH'))
        self.consumer_hh = SimpleConsumer(
            kafka_hh,
            settings.get('FRONTERA_GROUP'),
            settings.get('FRONTERA_INCOMING_TOPIC'),
            buffer_size=262144,
            max_buffer_size=10485760,
            auto_commit_every_n=1)
        self.producer_hh = SimpleProducer(kafka_hh)
        self.results_topic = settings.get("FRONTERA_RESULTS_TOPIC")
        self.job_config = {}
        self.zookeeper = ZookeeperSession(settings.get('ZOOKEEPER_LOCATION'),
                                          name_prefix=self.worker_prefix)

        kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
        self.partitions_count = len(
            kafka.get_partition_ids_for_topic(settings.get('INCOMING_TOPIC')))
        self.null_cycles = 0
Example #4
def spoorer(self):
    try:
        kafka_client = KafkaClient(self.kafka_hosts, timeout=self.timeout)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        kafka_topics = kafka_client.topics
    finally:
        kafka_client.close()

    try:
        zookeeper_client = KazooClient(hosts=self.zookeeper_hosts, read_only=True, timeout=self.timeout)
        zookeeper_client.start()
    except Exception as e:
        print "Error, cannot connect zookeeper server."
        sys.exit(1)

    try:
        groups = map(str,zookeeper_client.get_children(self.zookeeper_url + 'consumers'))
    except NoNodeError as e:
        print "Error, invalid zookeeper url."
        zookeeper_client.stop()
        sys.exit(2)
    else:
        for group in groups:
            if 'offsets' not in zookeeper_client.get_children(self.zookeeper_url + 'consumers/%s' % group): continue
            topic_path = 'consumers/%s/offsets' % (group)
            topics = map(str,zookeeper_client.get_children(self.zookeeper_url + topic_path))
            if len(topics) == 0: continue

            for topic in topics:
                if topic not in self.white_topic_group.keys():
                    continue
                elif group not in self.white_topic_group[topic].replace(' ','').split(','):
                    continue
                partition_path = 'consumers/%s/offsets/%s' % (group,topic)
                partitions = map(int,zookeeper_client.get_children(self.zookeeper_url + partition_path))

                for partition in partitions:
                    base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topic, partition)
                    owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
                    offset = zookeeper_client.get(self.zookeeper_url + offset_path)[0]

                    try:
                        owner = zookeeper_client.get(self.zookeeper_url + owner_path)[0]
                    except NoNodeError as e:
                        owner = 'null'

                    metric = {'datetime': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                              'topic': topic, 'group': group,
                              'partition': int(partition), 'logsize': None,
                              'offset': int(offset), 'lag': None, 'owner': owner}
                    self.result.append(metric)
    finally:
        zookeeper_client.stop()

    try:
        kafka_consumer = KafkaConsumer(bootstrap_servers=self.kafka_hosts)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        for kafka_topic in kafka_topics:
            self.kafka_logsize[kafka_topic] = {}
            partitions = kafka_client.get_partition_ids_for_topic(kafka_topic)

            for partition in partitions:
                offset = kafka_consumer.get_partition_offsets(kafka_topic, partition, -1, 1)[0]
                self.kafka_logsize[kafka_topic][partition] = offset

        with open(self.log_file,'w') as f1, open(self.log_day_file,'a') as f2:

            for metric in self.result:
                logsize = self.kafka_logsize[metric['topic']][metric['partition']]
                metric['logsize'] = int(logsize)
                metric['lag'] = int(logsize) - int(metric['offset'])

                f1.write(json.dumps(metric,sort_keys=True) + '\n')
                f1.flush()
                f2.write(json.dumps(metric,sort_keys=True) + '\n')
                f2.flush()
    finally:
        kafka_consumer.close()

    return ''
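
Each record appended to self.result and written to the log files ends up as one JSON object per line; a sketch of a single record, where only the field names come from the code above and all values are invented:

import json
import time

# Hypothetical values, illustrating the shape produced by spoorer().
metric = {'datetime': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
          'topic': 'my-topic', 'group': 'my-group', 'partition': 0,
          'logsize': 1042, 'offset': 1000, 'lag': 42, 'owner': 'consumer-1'}
print(json.dumps(metric, sort_keys=True))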
Example #5
Usage:  k-debug.py <host>

"""

from kafka import SimpleProducer, KafkaClient

import logging
import sys

logging.basicConfig()

kafka = KafkaClient(sys.argv[1] + ':9092')
#producer = SimpleProducer(kafka)

kafka.ensure_topic_exists(b'picasso-stackato-logs')

print("Client:  {0!r}".format(kafka))
md = kafka.send_metadata_request()
print("  {0!r}".format(md))

for t in kafka.topics:
    print("{0!r}:".format(t))
    print("  partitions: {0!r}:".format(kafka.get_partition_ids_for_topic(t)))
    
#kafka.ensure_topic_exists(b'my-topic')


print("done.")

## end
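
The script imports SimpleProducer but never instantiates it; if you also want to push a test message to the topic it just ensured, a minimal sketch (the payload is hypothetical):

producer = SimpleProducer(kafka)
producer.send_messages(b'picasso-stackato-logs', b'hello from k-debug')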
Example #6
def main():
    # R0915: "too many statements in function (>50)"
    # pylint: disable=R0915

    if len(sys.argv) != 8:
        print "Wrong number of arguments"
        usage()

    (kafka_topic, kafka_broker, mysql_host, mysql_port, mysql_user,
     mysql_password, mysql_table) = sys.argv[1:8]

    sql_db = MySQLdb.connect(host=mysql_host,
                             port=int(mysql_port),
                             user=mysql_user,
                             passwd=mysql_password)

    query = sql_db.cursor()

    client = KafkaClient(kafka_broker)

    consumer = KafkaConsumer(kafka_topic,
                             metadata_broker_list=[kafka_broker],
                             auto_commit_enable=False,
                             auto_offset_reset='smallest')

    last_offsets = {}
    partition_ids = client.get_partition_ids_for_topic(kafka_topic)
    for partition in partition_ids:
        offsets = consumer.get_partition_offsets(kafka_topic, partition, -1, 1)
        print offsets

        # Don't really understand this format, so put in asserts
        # (Pdb) consumer.get_partition_offsets("topicname", 0, -1, 1)
        # (15471)
        assert len(offsets) == 1
        assert offsets[0] > 0

        next_offset = offsets[0]
        last_offset = next_offset - 1
        last_offsets[partition] = last_offset

    finished_partitions = set()

    print last_offsets
    count = 0

    # mapping from primary key tuples, to row data
    insert_batch = {}
    insert_sql = None

    for m in consumer:
        if m.partition in finished_partitions:
            continue

        count += 1

        payload = m.value
        (first_line, rest) = payload.split("\r\n", 1)
        (_notused, header_len, _body_len) = first_line.split(" ")
        header_len = int(header_len)
        body = rest[header_len:]

        primary_key_str = m.key
        #            import pdb; pdb.set_trace()
        primary_keys = json.loads(primary_key_str)
        primary_tuples = sorted(primary_keys.items())
        sorted_primary_key_names = [k for (k, v) in primary_tuples]
        sorted_primary_key_values = [int(v) for (k, v) in primary_tuples]

        if len(body) > 0:
            # This is a write
            data = json.loads(body)

            # date fields have to be turned from a number back into a datetime object
            date_fields = ['createDate', 'updateDate']
            for d in date_fields:
                if d not in data:
                    continue
                val = data[d]
                if val is None:
                    continue
                if val == -62170156800000:
                    # this is hacky and a sign that i'm doing something wrong, I think.
                    val = "0000-00-00 00:00:00"
                else:
                    val = val / 1000
                    import datetime
                    val = datetime.datetime.utcfromtimestamp(val)
                data[d] = val

            keys = [k for (k, v) in sorted(data.items())]
            values = [v for (k, v) in sorted(data.items())]

            keys_wo_primary = [k for (k, v) in sorted(data.items())]
            for p in sorted_primary_key_names:
                keys_wo_primary.remove(p)

            # e.g.
            # insert into dbname.tablename (col1, col2) values (%s, %s) on duplicate key update col2 = values(col2)
            # assuming that col1 is the primary key
            insert_sql = """insert into %s """ % mysql_table
            insert_sql += """ (%s) """ % (", ".join(keys))
            insert_sql += " values (%s) " % (", ".join(["%s"] * len(values)))
            insert_sql += "on duplicate key update "
            insert_sql += ", ".join(
                ["%s = values(%s)" % (k, k) for k in keys_wo_primary])
            insert_batch[tuple(primary_tuples)] = tuple(values)
            if len(insert_batch) > 5000:
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}
        else:
            # This is a delete
            if len(insert_batch) > 0 and insert_sql is not None:
                # flush all writes before processing any deletes
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}

            # get the primary keys, and delete the row
            where_clause = ' and '.join(
                ["%s = %%s" % k for k in sorted_primary_key_names])
            # e.g.
            # delete from dbname.tablename where field1 = %s and field2 = %s
            delete_sql = """delete from %s where %s""" % (mysql_table,
                                                          where_clause)
            values = tuple(sorted_primary_key_values)
            query.execute(delete_sql, values)
            sql_db.commit()

        # how do I know when to stop?
        print "Partition %d Offset %d of %d" % (m.partition, m.offset,
                                                last_offsets.get(m.partition))
        if m.offset >= last_offsets.get(m.partition):
            finished_partitions.add(m.partition)
            if len(finished_partitions) == len(last_offsets):
                # All partitions are done.
                break

    if len(insert_batch) > 0:
        # flush any remaining writes
        query.executemany(insert_sql, insert_batch.values())
        sql_db.commit()
        insert_batch = {}

    print "Imported %d messages into mysql" % count
Example #7
import sys
import getopt
import json
from pprint import pformat

from confluent_kafka import Consumer, KafkaException
from kafka import KafkaClient

client = KafkaClient(
    ['10.156.0.3:6667', '10.156.0.4:6667', '10.156.0.5:6667'])
try:
    topic_partition_ids = client.get_partition_ids_for_topic(
        'mles.announcements')
    x = len(topic_partition_ids)
except Exception as ex:
    print("client:: error:")
    print(ex)
finally:
    client.close()

# Consumer configuration
conf = {
    # confluent_kafka expects bootstrap.servers as a comma-separated string
    'bootstrap.servers': '10.156.0.3:6667,10.156.0.4:6667,10.156.0.5:6667',
    'group.id': 'sschokorov'
}

# Create Consumer instance
xs = [2 * i for i in range(x)]
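
# The "Create Consumer instance" comment above is not followed by any consumer
# code in this snippet; a minimal sketch of what that could look like with
# confluent_kafka, assigning the partition ids fetched earlier (topic name
# reused from above, everything else hypothetical).
from confluent_kafka import TopicPartition

consumer = Consumer(conf)
consumer.assign([TopicPartition('mles.announcements', p)
                 for p in topic_partition_ids])
msg = consumer.poll(timeout=1.0)
if msg is not None and msg.error() is None:
    print(msg.value())
consumer.close()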
def main():
    # R0915: "too many statements in function (>50)"
    # pylint: disable=R0915

    if len(sys.argv) != 8:
        print "Wrong number of arguments"
        usage()

    (kafka_topic, kafka_broker, mysql_host, mysql_port, mysql_user, mysql_password, mysql_table) = sys.argv[1:8]

    sql_db = MySQLdb.connect(host=mysql_host,
                             port=int(mysql_port),
                             user=mysql_user,
                             passwd=mysql_password)

    query = sql_db.cursor()

    client = KafkaClient(kafka_broker)

    consumer = KafkaConsumer(kafka_topic,
                             metadata_broker_list=[kafka_broker],
                             auto_commit_enable=False,
                             auto_offset_reset='smallest')

    last_offsets = {}
    partition_ids = client.get_partition_ids_for_topic(kafka_topic)
    for partition in partition_ids:
        offsets = consumer.get_partition_offsets(kafka_topic, partition, -1, 1)
        print offsets
        
        # Don't really understand this format, so put in asserts
        # (Pdb) consumer.get_partition_offsets("appdb.bclab1.tivo.com", 0, -1, 1)
        # (15471)
        assert len(offsets) == 1
        assert offsets[0] > 0

        next_offset = offsets[0]
        last_offset = next_offset - 1
        last_offsets[partition] = last_offset

    finished_partitions = set()

    print last_offsets
    count = 0

    # mapping from primary key tuples, to row data
    insert_batch = {}
    insert_sql = None

    for m in consumer:
        if m.partition in finished_partitions:
            continue

        count += 1

        payload = m.value
        (first_line, rest) = payload.split("\r\n", 1)
        (_notused, header_len, _body_len) = first_line.split(" ")
        header_len = int(header_len)
        body = rest[header_len:]

        primary_key_str = m.key
        #            import pdb; pdb.set_trace()
        primary_keys = json.loads(primary_key_str)
        primary_tuples = sorted(primary_keys.items())
        sorted_primary_key_names = [ k for (k,v) in primary_tuples ]
        sorted_primary_key_values = [ int(v) for (k,v) in primary_tuples ]

        if len(body) > 0:
            # This is a write
            data = json.loads(body)
                
            # date fields have to be turned from a number back into a datetime object
            date_fields = ['createDate', 'endTime', 'expectedDeletion', 'startTime', 'updateDate', 
                           'availabilityWindowStart', 'availabilityWindowEnd', 
                           'entitlementWindowStart', 'entitlementWindowEnd']
            for d in date_fields:
                if d not in data:
                    continue
                val = data[d]
                if val is None:
                    continue
                if val == -62170156800000:
                    # this is hacky and a sign that i'm doing something wrong, I think.
                    val = "0000-00-00 00:00:00"
                else:
                    val = val / 1000
                    import datetime
                    val = datetime.datetime.utcfromtimestamp(val)
                data[d] = val

            keys = [ k for (k, v) in sorted(data.items()) ]
            values = [ v for (k, v) in sorted(data.items()) ]

            keys_wo_primary = [ k for (k, v) in sorted(data.items()) ]
            for p in sorted_primary_key_names:
                keys_wo_primary.remove(p)

            # e.g.
            # insert into trio.recording (col1, col2) values (%s, %s) on duplicate key update col2 = values(col2)
            # assuming that col1 is the primary key
            insert_sql = """insert into %s """ % mysql_table
            insert_sql += """ (%s) """ % (", ".join(keys))
            insert_sql += " values (%s) " % (", ".join(["%s"] * len(values) ))
            insert_sql +=  "on duplicate key update "
            insert_sql += ", ".join(["%s = values(%s)" % (k, k) for k in keys_wo_primary ])
            insert_batch[tuple(primary_tuples)] = tuple(values)
            if len(insert_batch) > 5000:
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}
        else:
            # This is a delete
            if len(insert_batch) > 0 and insert_sql is not None:
                # flush all writes before processing any deletes
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}

            # get the primary keys, and delete the row
            where_clause = ' and '.join([ "%s = %%s" % k for k in sorted_primary_key_names ])
            # e.g.
            # delete from trio.recording where bodyId=%s and recordingId = %s
            delete_sql = """delete from %s where %s""" % (mysql_table, where_clause)
            values = tuple(sorted_primary_key_values)
            query.execute(delete_sql, values)
            sql_db.commit()

        # how do I know when to stop?
        print "Partition %d Offset %d of %d" % (m.partition, m.offset, last_offsets.get(m.partition))
        if m.offset >= last_offsets.get(m.partition):
            finished_partitions.add(m.partition)
            if len(finished_partitions) == len(last_offsets):
                # All partitions are done.
                break

    if len(insert_batch) > 0:
        # flush any remaining writes
        query.executemany(insert_sql, insert_batch.values())
        sql_db.commit()
        insert_batch = {}

    print "Imported %d messages into mysql" % count