class Kafka(object):

    executor = ThreadPoolExecutor(20)

    def __init__(self, broker):
        self.broker = broker
        self.client = KafkaClient(broker, timeout=3)

    @run_on_executor
    def getPartition(self, topic):
        """Return the partition id list for the given topic."""
        return self.client.get_partition_ids_for_topic(topic)

    @run_on_executor
    def getLogsize(self, topic, partitions):
        """Return logsize data for the given topic and partition list."""
        tp = self.client.send_offset_request(
            [OffsetRequestPayload(topic, p, -1, 1) for p in partitions])
        return {p.partition: p.offsets[0] for p in tp}

    @run_on_executor
    def getOffsets(self, topic, partitions, group):
        """Return offsets data for the given topic, partitions and group."""
        try:
            # Try the zookeeper-storage API first; if the given group has no
            # offsets stored there, UnknownTopicOrPartitionError is raised.
            tp = self.client.send_offset_fetch_request(
                group,
                [OffsetRequestPayload(topic, p, -1, 1) for p in partitions])
            offsets = {p.partition: p.offset for p in tp}
        except UnknownTopicOrPartitionError:
            # Fall back to the kafka-storage API to fetch the offsets.
            consumer = KafkaConsumer(group_id=group,
                                     bootstrap_servers=self.broker,
                                     enable_auto_commit=False)
            tp = [TopicPartition(topic, p) for p in partitions]
            consumer.assign(tp)
            offsets = {p.partition: consumer.position(p) for p in tp}
        return offsets
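# Hedged usage sketch (not part of the original source): one way the Kafka
# helper above could be driven from a Tornado coroutine, since @run_on_executor
# returns futures that resolve on the IOLoop. Broker address, topic and group
# names below are placeholders, and collect_lag is an illustrative helper.
from tornado import gen, ioloop

@gen.coroutine
def collect_lag(broker, topic, group):
    kafka = Kafka(broker)
    partitions = yield kafka.getPartition(topic)
    logsize = yield kafka.getLogsize(topic, partitions)
    offsets = yield kafka.getOffsets(topic, partitions, group)
    # Per-partition lag = logsize (latest offset) - committed consumer offset.
    raise gen.Return({p: logsize[p] - offsets[p] for p in partitions})

if __name__ == "__main__":
    result = ioloop.IOLoop.current().run_sync(
        lambda: collect_lag("127.0.0.1:9092", "test-topic", "test-group"))
    print(result)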
def handler(self):
    """
    Query the current logsize of every partition for each topic in the
    configured Kafka clusters and write it to LevelDB.
    After each collection run the retention_hour setting is checked and
    expired records are deleted.
    """
    clusters = base.config["collector"]["clusters"]
    for cluster, metric in clusters.items():
        client = KafkaClient(metric["brokers"], timeout=3)
        for topic in metric["topics"]:
            partitions = client.get_partition_ids_for_topic(topic)
            payload = [
                OffsetRequestPayload(topic, p, -1, 1) for p in partitions
            ]
            logsize = {
                p.partition: p.offsets[0]
                for p in client.send_offset_request(payload)
            }
            if logsize:
                key = str(int(time.time())).encode("utf-8")
                value = json.dumps(logsize).encode("utf-8")
                db = base.init_leveldb(cluster=cluster, topic=topic)
                db.Put(key, value)
                deadline = base.config["collector"]["clusters"][cluster][
                    "retention_hour"] * 3600
                for key, _ in db.RangeIter():
                    if time.time() - int(key) > deadline:
                        db.Delete(key)
                    else:
                        break
        client.close()
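# Hedged sketch (not part of the original source): reading back the records the
# handler above stores. It assumes the same py-leveldb bindings (db.RangeIter)
# and the key/value format written by handler(): key = unix timestamp bytes,
# value = JSON dict of {partition: logsize}. read_logsize_history is an
# illustrative helper, not an API of the original project.
import json

def read_logsize_history(db):
    """Return a list of (timestamp, {partition: logsize}) tuples, oldest first."""
    history = []
    for key, value in db.RangeIter():
        history.append((int(key), json.loads(value)))
    return history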
def __init__(self, settings):
    super(HHStrategyWorker, self).__init__(settings, topic)
    self.slot = Slot(log_processing=self.work,
                     incoming=self.incoming,
                     outgoing=self.outgoing,
                     is_master=settings.get("FRONTERA_MASTER"))

    kafka_hh = KafkaClient(settings.get('KAFKA_LOCATION_HH'))
    self.consumer_hh = SimpleConsumer(kafka_hh,
                                      settings.get('FRONTERA_GROUP'),
                                      settings.get('FRONTERA_INCOMING_TOPIC'),
                                      buffer_size=262144,
                                      max_buffer_size=10485760,
                                      auto_commit_every_n=1)
    self.producer_hh = SimpleProducer(kafka_hh)
    self.results_topic = settings.get("FRONTERA_RESULTS_TOPIC")
    self.job_config = {}
    self.zookeeper = ZookeeperSession(settings.get('ZOOKEEPER_LOCATION'),
                                      name_prefix=self.worker_prefix)

    kafka = KafkaClient(settings.get('KAFKA_LOCATION'))
    self.partitions_count = len(
        kafka.get_partition_ids_for_topic(settings.get('INCOMING_TOPIC')))
    self.null_cycles = 0
def spoorer(self):
    try:
        kafka_client = KafkaClient(self.kafka_hosts, timeout=self.timeout)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        kafka_topics = kafka_client.topics
    finally:
        kafka_client.close()

    try:
        zookeeper_client = KazooClient(hosts=self.zookeeper_hosts, read_only=True, timeout=self.timeout)
        zookeeper_client.start()
    except Exception as e:
        print "Error, cannot connect zookeeper server."
        sys.exit(1)

    try:
        groups = map(str, zookeeper_client.get_children(self.zookeeper_url + 'consumers'))
    except NoNodeError as e:
        print "Error, invalid zookeeper url."
        zookeeper_client.stop()
        sys.exit(2)
    else:
        for group in groups:
            if 'offsets' not in zookeeper_client.get_children(self.zookeeper_url + 'consumers/%s' % group):
                continue
            topic_path = 'consumers/%s/offsets' % (group)
            topics = map(str, zookeeper_client.get_children(self.zookeeper_url + topic_path))
            if len(topics) == 0:
                continue
            for topic in topics:
                if topic not in self.white_topic_group.keys():
                    continue
                elif group not in self.white_topic_group[topic].replace(' ', '').split(','):
                    continue
                partition_path = 'consumers/%s/offsets/%s' % (group, topic)
                partitions = map(int, zookeeper_client.get_children(self.zookeeper_url + partition_path))
                for partition in partitions:
                    base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topic, partition)
                    owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
                    offset = zookeeper_client.get(self.zookeeper_url + offset_path)[0]
                    try:
                        owner = zookeeper_client.get(self.zookeeper_url + owner_path)[0]
                    except NoNodeError as e:
                        owner = 'null'
                    metric = {'datetime': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                              'topic': topic,
                              'group': group,
                              'partition': int(partition),
                              'logsize': None,
                              'offset': int(offset),
                              'lag': None,
                              'owner': owner}
                    self.result.append(metric)
    finally:
        zookeeper_client.stop()

    try:
        kafka_consumer = KafkaConsumer(bootstrap_servers=self.kafka_hosts)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        for kafka_topic in kafka_topics:
            self.kafka_logsize[kafka_topic] = {}
            partitions = kafka_client.get_partition_ids_for_topic(kafka_topic)
            for partition in partitions:
                offset = kafka_consumer.get_partition_offsets(kafka_topic, partition, -1, 1)[0]
                self.kafka_logsize[kafka_topic][partition] = offset

        with open(self.log_file, 'w') as f1, open(self.log_day_file, 'a') as f2:
            for metric in self.result:
                logsize = self.kafka_logsize[metric['topic']][metric['partition']]
                metric['logsize'] = int(logsize)
                metric['lag'] = int(logsize) - int(metric['offset'])
                f1.write(json.dumps(metric, sort_keys=True) + '\n')
                f1.flush()
                f2.write(json.dumps(metric, sort_keys=True) + '\n')
                f2.flush()
    finally:
        kafka_consumer.close()

    return ''
Usage: k-debug.py <host>
"""
from kafka import SimpleProducer, KafkaClient

import logging
import sys

logging.basicConfig()

kafka = KafkaClient(sys.argv[1] + ':9092')
#producer = SimpleProducer(kafka)

kafka.ensure_topic_exists(b'picasso-stackato-logs')

print("Client: {0!r}".format(kafka))
md = kafka.send_metadata_request()
print(" {0!r}".format(md))

for t in kafka.topics:
    print("{0!r}:".format(t))
    print(" partitions: {0!r}:".format(kafka.get_partition_ids_for_topic(t)))

#kafka.ensure_topic_exists(b'my-topic')

print("done.")

## end
def main():
    # R0915: "too many statements in function (>50)"
    # pylint: disable=R0915
    if len(sys.argv) != 8:
        print "Wrong number of arguments"
        usage()

    (kafka_topic, kafka_broker, mysql_host, mysql_port, mysql_user,
     mysql_password, mysql_table) = sys.argv[1:8]

    sql_db = MySQLdb.connect(host=mysql_host,
                             port=int(mysql_port),
                             user=mysql_user,
                             passwd=mysql_password)
    query = sql_db.cursor()

    client = KafkaClient(kafka_broker)
    consumer = KafkaConsumer(kafka_topic,
                             metadata_broker_list=[kafka_broker],
                             auto_commit_enable=False,
                             auto_offset_reset='smallest')

    last_offsets = {}
    partition_ids = client.get_partition_ids_for_topic(kafka_topic)
    for partition in partition_ids:
        offsets = consumer.get_partition_offsets(kafka_topic, partition, -1, 1)
        print offsets
        # Don't really understand this format, so put in asserts
        # (Pdb) consumer.get_partition_offsets("topicname", 0, -1, 1)
        # (15471)
        assert len(offsets) == 1
        assert offsets[0] > 0
        next_offset = offsets[0]
        last_offset = next_offset - 1
        last_offsets[partition] = last_offset

    finished_partitions = set()
    print last_offsets

    count = 0
    # mapping from primary key tuples, to row data
    insert_batch = {}
    insert_sql = None

    for m in consumer:
        if m.partition in finished_partitions:
            continue
        count += 1
        payload = m.value
        (first_line, rest) = payload.split("\r\n", 1)
        (_notused, header_len, _body_len) = first_line.split(" ")
        header_len = int(header_len)
        body = rest[header_len:]
        primary_key_str = m.key
        # import pdb; pdb.set_trace()
        primary_keys = json.loads(primary_key_str)
        primary_tuples = sorted(primary_keys.items())
        sorted_primary_key_names = [k for (k, v) in primary_tuples]
        sorted_primary_key_values = [int(v) for (k, v) in primary_tuples]

        if len(body) > 0:
            # This is a write
            data = json.loads(body)

            # date fields have to be turned from a number back into a datetime object
            date_fields = ['createDate', 'updateDate']
            for d in date_fields:
                if d not in data:
                    continue
                val = data[d]
                if val is None:
                    continue
                if val == -62170156800000:
                    # this is hacky and a sign that i'm doing something wrong, I think.
                    val = "0000-00-00 00:00:00"
                else:
                    val = val / 1000
                    import datetime
                    val = datetime.datetime.utcfromtimestamp(val)
                data[d] = val

            keys = [k for (k, v) in sorted(data.items())]
            values = [v for (k, v) in sorted(data.items())]
            keys_wo_primary = [k for (k, v) in sorted(data.items())]
            for p in sorted_primary_key_names:
                keys_wo_primary.remove(p)

            # e.g.
            # insert into dbname.tablename (col1, col2) values (%s, %s) on duplicate key update col2 = values(col2)
            # assuming that col1 is the primary key
            insert_sql = """insert into %s """ % mysql_table
            insert_sql += """ (%s) """ % (", ".join(keys))
            insert_sql += " values (%s) " % (", ".join(["%s"] * len(values)))
            insert_sql += "on duplicate key update "
            insert_sql += ", ".join(
                ["%s = values(%s)" % (k, k) for k in keys_wo_primary])

            insert_batch[tuple(primary_tuples)] = tuple(values)
            if len(insert_batch) > 5000:
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}
        else:
            # This is a delete
            if len(insert_batch) > 0 and insert_sql is not None:
                # flush all writes before processing any deletes
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}

            # get the primary keys, and delete the row
            where_clause = ' and '.join(
                ["%s = %%s" % k for k in sorted_primary_key_names])
            # e.g.
            # delete from dbname.tablename where field1 = %s and field2 = %s
            delete_sql = """delete from %s where %s""" % (mysql_table, where_clause)
            values = tuple(sorted_primary_key_values)
            query.execute(delete_sql, values)
            sql_db.commit()

        # how do I know when to stop?
        print "Partition %d Offset %d of %d" % (m.partition, m.offset,
                                                last_offsets.get(m.partition))
        if m.offset >= last_offsets.get(m.partition):
            finished_partitions.add(m.partition)
            if len(finished_partitions) == len(last_offsets):
                # All partitions are done.
                break

    if len(insert_batch) > 0:
        # flush any remaining writes
        query.executemany(insert_sql, insert_batch.values())
        sql_db.commit()
        insert_batch = {}

    print "Imported %d messages into mysql" % count
import sys
import getopt
import json
from pprint import pformat

from confluent_kafka import Consumer, KafkaException
from kafka import KafkaClient

try:
    client = KafkaClient(
        ['10.156.0.3:6667', '10.156.0.4:6667', '10.156.0.5:6667'])
    topic_partition_ids = client.get_partition_ids_for_topic(
        'mles.announcements')
    x = len(topic_partition_ids)
    client.close()
except Exception as ex:
    print("client:: error:")
    print(ex)
finally:
    client.close()

# Consumer configuration
# (confluent_kafka expects bootstrap.servers as a comma-separated string,
#  not a list)
conf = {
    'bootstrap.servers': '10.156.0.3:6667,10.156.0.4:6667,10.156.0.5:6667',
    'group.id': 'sschokorov'
}

# Create Consumer instance
xs = [2 * i for i in range(x)]
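# Hedged sketch (not part of the original snippet): one plausible continuation
# that actually creates a confluent_kafka Consumer from `conf` above and
# assigns it the partition IDs discovered via KafkaClient. The topic name is
# reused from the snippet; everything else here is illustrative only.
from confluent_kafka import TopicPartition

consumer = Consumer(conf)
consumer.assign([TopicPartition('mles.announcements', p)
                 for p in topic_partition_ids])
try:
    while True:
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue
        if msg.error():
            raise KafkaException(msg.error())
        print(msg.value())
finally:
    consumer.close()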
def main():
    # R0915: "too many statements in function (>50)"
    # pylint: disable=R0915
    if len(sys.argv) != 8:
        print "Wrong number of arguments"
        usage()

    (kafka_topic, kafka_broker, mysql_host, mysql_port, mysql_user,
     mysql_password, mysql_table) = sys.argv[1:8]

    sql_db = MySQLdb.connect(host=mysql_host,
                             port=int(mysql_port),
                             user=mysql_user,
                             passwd=mysql_password)
    query = sql_db.cursor()

    client = KafkaClient(kafka_broker)
    consumer = KafkaConsumer(kafka_topic,
                             metadata_broker_list=[kafka_broker],
                             auto_commit_enable=False,
                             auto_offset_reset='smallest')

    last_offsets = {}
    partition_ids = client.get_partition_ids_for_topic(kafka_topic)
    for partition in partition_ids:
        offsets = consumer.get_partition_offsets(kafka_topic, partition, -1, 1)
        print offsets
        # Don't really understand this format, so put in asserts
        # (Pdb) consumer.get_partition_offsets("appdb.bclab1.tivo.com", 0, -1, 1)
        # (15471)
        assert len(offsets) == 1
        assert offsets[0] > 0
        next_offset = offsets[0]
        last_offset = next_offset - 1
        last_offsets[partition] = last_offset

    finished_partitions = set()
    print last_offsets

    count = 0
    # mapping from primary key tuples, to row data
    insert_batch = {}
    insert_sql = None

    for m in consumer:
        if m.partition in finished_partitions:
            continue
        count += 1
        payload = m.value
        (first_line, rest) = payload.split("\r\n", 1)
        (_notused, header_len, _body_len) = first_line.split(" ")
        header_len = int(header_len)
        body = rest[header_len:]
        primary_key_str = m.key
        # import pdb; pdb.set_trace()
        primary_keys = json.loads(primary_key_str)
        primary_tuples = sorted(primary_keys.items())
        sorted_primary_key_names = [k for (k, v) in primary_tuples]
        sorted_primary_key_values = [int(v) for (k, v) in primary_tuples]

        if len(body) > 0:
            # This is a write
            data = json.loads(body)

            # date fields have to be turned from a number back into a datetime object
            date_fields = ['createDate', 'endTime', 'expectedDeletion', 'startTime',
                           'updateDate', 'availabilityWindowStart', 'availabilityWindowEnd',
                           'entitlementWindowStart', 'entitlementWindowEnd']
            for d in date_fields:
                if d not in data:
                    continue
                val = data[d]
                if val is None:
                    continue
                if val == -62170156800000:
                    # this is hacky and a sign that i'm doing something wrong, I think.
                    val = "0000-00-00 00:00:00"
                else:
                    val = val / 1000
                    import datetime
                    val = datetime.datetime.utcfromtimestamp(val)
                data[d] = val

            keys = [k for (k, v) in sorted(data.items())]
            values = [v for (k, v) in sorted(data.items())]
            keys_wo_primary = [k for (k, v) in sorted(data.items())]
            for p in sorted_primary_key_names:
                keys_wo_primary.remove(p)

            # e.g.
            # insert into trio.recording (col1, col2) values (%s, %s) on duplicate key update col2 = values(col2)
            # assuming that col1 is the primary key
            insert_sql = """insert into %s """ % mysql_table
            insert_sql += """ (%s) """ % (", ".join(keys))
            insert_sql += " values (%s) " % (", ".join(["%s"] * len(values)))
            insert_sql += "on duplicate key update "
            insert_sql += ", ".join(
                ["%s = values(%s)" % (k, k) for k in keys_wo_primary])

            insert_batch[tuple(primary_tuples)] = tuple(values)
            if len(insert_batch) > 5000:
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}
        else:
            # This is a delete
            if len(insert_batch) > 0 and insert_sql is not None:
                # flush all writes before processing any deletes
                query.executemany(insert_sql, insert_batch.values())
                sql_db.commit()
                insert_batch = {}

            # get the primary keys, and delete the row
            where_clause = ' and '.join(
                ["%s = %%s" % k for k in sorted_primary_key_names])
            # e.g.
            # delete from trio.recording where bodyId=%s and recordingId = %s
            delete_sql = """delete from %s where %s""" % (mysql_table, where_clause)
            values = tuple(sorted_primary_key_values)
            query.execute(delete_sql, values)
            sql_db.commit()

        # how do I know when to stop?
        print "Partition %d Offset %d of %d" % (m.partition, m.offset,
                                                last_offsets.get(m.partition))
        if m.offset >= last_offsets.get(m.partition):
            finished_partitions.add(m.partition)
            if len(finished_partitions) == len(last_offsets):
                # All partitions are done.
                break

    if len(insert_batch) > 0:
        # flush any remaining writes
        query.executemany(insert_sql, insert_batch.values())
        sql_db.commit()
        insert_batch = {}

    print "Imported %d messages into mysql" % count