def _read_one_message_per_partition(self, topic_name: str, offset: Union[str, int]):
    config = self.config.create_kafka_python_config()
    with closing(kafka.KafkaConsumer(**config)) as consumer:
        topic_partitions = [
            kafka.TopicPartition(topic=topic_name, partition=partition)
            for partition in consumer.partitions_for_topic(topic_name)
        ]
        partition_ends: Dict[kafka.TopicPartition, int] = consumer.end_offsets(
            topic_partitions)
        partition_starts: Dict[kafka.TopicPartition, int] = consumer.beginning_offsets(
            topic_partitions)
        if offset == "first":
            partition_offsets = (partition_starts[tp] for tp in topic_partitions)
        elif offset == "last":
            partition_offsets = (partition_ends[tp] - 1 for tp in topic_partitions)
        else:
            partition_offsets = (max(offset, partition_starts[tp])
                                 for tp in topic_partitions)
        assignments = [
            (tp, offset)
            for tp, offset in zip(topic_partitions, partition_offsets)
            if partition_ends[tp] > offset
        ]
        consumer.assign([tp for tp, _ in assignments])
        for tp, offset in assignments:
            consumer.seek(tp, offset)
        unassigned_partitions = [
            tp for tp in topic_partitions if tp not in consumer.assignment()
        ]
        messages_received: Dict[
            int, Optional[kafka.consumer.fetcher.ConsumerRecord]] = {
                tp.partition: None for tp in unassigned_partitions
            }
        for message in cast(
                Iterable[kafka.consumer.fetcher.ConsumerRecord], consumer):
            if message.partition not in messages_received:
                messages_received[message.partition] = message
                consumer.pause(
                    kafka.TopicPartition(message.topic, message.partition))
            if len(messages_received) == len(topic_partitions):
                # we have one record for every partition, so we're done.
                break
        return messages_received

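# Usage sketch for _read_one_message_per_partition. It assumes an
# inspector-style class whose `config.create_kafka_python_config()` returns
# kwargs for kafka.KafkaConsumer, as the function above implies; `inspector`
# is a hypothetical instance and we call the private method directly only
# for illustration.
first = inspector._read_one_message_per_partition("my-topic", "first")  # oldest record per partition
last = inspector._read_one_message_per_partition("my-topic", "last")    # newest record per partition
at_42 = inspector._read_one_message_per_partition("my-topic", 42)       # record at offset 42 or later
for partition, record in first.items():
    # Partitions that held no readable message map to None.
    print(partition, None if record is None else record.offset)
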
def restore_from_kafka(parser, server_list, topic):
    consumer = kafka.KafkaConsumer(bootstrap_servers=server_list)
    partition = kafka.TopicPartition(topic, 0)
    # Use assign() rather than subscribe(): seek() only works on manually
    # assigned partitions.
    consumer.assign([partition])
    end_offset = consumer.end_offsets([partition])[partition]
    if end_offset > 0:
        print("start restore from Kafka")
        consumer.seek(partition, end_offset - 1)
        records = consumer.poll(app_config.snapshot_poll_timeout_sec * 1000)
        if not records:
            raise RuntimeError(
                "No message received from Kafka during restore even though end_offset>0")
        last_msg_compressed = records[partition][0]
        last_msg = zlib.decompress(base64.b64decode(last_msg_compressed.value))
        parser = jsonpickle.loads(last_msg)
        # After loading from Kafka the keys of parser.root_node.key_to_child_node
        # are strings instead of ints, so we cast them back to int.
        for key in list(parser.root_node.key_to_child_node.keys()):
            parser.root_node.key_to_child_node[int(key)] = \
                parser.root_node.key_to_child_node.pop(key)
        print("end restore, number of clusters " + str(len(parser.clusters)))
    consumer.close()
    return parser

def commit_offset(self, group_id, topic, partition, offset):
    if self.group_id is None:
        raise Exception('you must provide a group_id')
    tp = kafka.TopicPartition(topic=str(topic), partition=int(partition))
    # Commit the offset to both Kafka and the database.
    if self.offset_store_mode == 'both':
        self.engine.commit(
            offsets={tp: kafka.OffsetAndMetadata(offset, None)})
        self.DbClient.commit_offset(topic=topic, group_id=group_id,
                                    partition=partition, offset=offset)
    # Commit to the Kafka server only.
    elif self.offset_store_mode == 'kafka':
        self.engine.commit(
            offsets={tp: kafka.OffsetAndMetadata(offset, None)})
    # Commit to the database only.
    else:
        self.DbClient.commit_offset(topic=topic, group_id=group_id,
                                    partition=partition, offset=offset)

def zk_offsets_to_kafka_offsets(self, zk_offsets, topic):
    """Convert a {partition: offset} mapping (as stored in ZooKeeper) into a
    {TopicPartition: offset} mapping usable with kafka-python."""
    kafka_offsets = {}
    for partition, offset in zk_offsets.items():
        tp = kafka.TopicPartition(topic, int(partition))
        kafka_offsets[tp] = offset
    return kafka_offsets

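# A minimal usage sketch for zk_offsets_to_kafka_offsets. The `migrator`
# instance and the input values are hypothetical; ZooKeeper stores partition
# ids as strings, which is why the function casts them to int.
zk_offsets = {'0': 120, '1': 87}
kafka_offsets = migrator.zk_offsets_to_kafka_offsets(zk_offsets, 'my-topic')
# -> {TopicPartition(topic='my-topic', partition=0): 120,
#     TopicPartition(topic='my-topic', partition=1): 87}
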
def offset_range_for_timestamp_range(brokers, start, end, topic):
    """Determine OffsetRange for a given timestamp range

    Parameters
    ----------
    brokers : list of str
        List of kafka broker hostports to bootstrap the connection with
    start : number
        Unix timestamp in seconds
    end : number
        Unix timestamp in seconds
    topic : str
        Topic to fetch offsets for

    Returns
    -------
    list of OffsetRange or None
        Per-partition ranges of offsets to read
    """
    consumer = kafka.KafkaConsumer(bootstrap_servers=brokers)
    partitions = consumer.partitions_for_topic(topic)
    if partitions is None:
        # Topic does not exist.
        return None
    partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    o_start = offsets_for_times(consumer, partitions, start)
    o_end = offsets_for_times(consumer, partitions, end)
    return [OffsetRange(tp, o_start[tp], o_end[tp]) for tp in partitions]

def offset_range_for_timestamp_range(brokers, start, end, topic=mjolnir.kafka.TOPIC_RESULT):
    """Determine OffsetRange for a given timestamp range

    Parameters
    ----------
    brokers : list of str
        List of kafka broker hostports to bootstrap kafka connection with
    start : number
        Unix timestamp in seconds
    end : number
        Unix timestamp in seconds
    topic : str
        Kafka topic to retrieve offsets for

    Returns
    -------
    list of pyspark.streaming.kafka.OffsetRange or None
        Per-partition ranges of offsets to read
    """
    consumer = kafka.KafkaConsumer(bootstrap_servers=brokers,
                                   api_version=mjolnir.kafka.BROKER_VERSION)
    partitions = consumer.partitions_for_topic(topic)
    if partitions is None:
        # Topic does not exist.
        return None
    partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    o_start = offsets_for_times(consumer, partitions, start)
    o_end = offsets_for_times(consumer, partitions, end)
    return [
        OffsetRange(tp.topic, tp.partition, o_start[tp], o_end[tp])
        for tp in partitions
    ]

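# Both variants above rely on an offsets_for_times() helper that is not shown.
# This is a minimal sketch of it, consistent with the unit test further below:
# ask the broker for the first offset at/after the timestamp, and for
# partitions where the broker reports None, fall back to the end-of-partition
# position. The exact signature is an assumption inferred from the call sites.
def offsets_for_times(consumer, partitions, timestamp):
    """Map each TopicPartition to the first offset at/after `timestamp` (seconds)."""
    response = consumer.offsets_for_times(
        {tp: int(timestamp * 1000) for tp in partitions})
    offsets = {}
    missing = []
    for tp, offset_and_ts in response.items():
        if offset_and_ts is None:
            missing.append(tp)
        else:
            offsets[tp] = offset_and_ts.offset
    if missing:
        # No message at/after the timestamp: use the current end of partition.
        # kafka-python requires the partitions to be assigned before seeking.
        consumer.assign(missing)
        consumer.seek_to_end(*missing)
        for tp in missing:
            offsets[tp] = consumer.position(tp)
    return offsets
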
def main():
    parser = argparse.ArgumentParser(
        description="Kafka client to get groups and topics status")
    parser.add_argument(
        "--server",
        type=str,
        metavar="HOST",
        default="localhost",
        help="Kafka bootstrap-server address",
    )
    parser.add_argument(
        "--port",
        type=int,
        metavar="PORT",
        default=9092,
        help="Kafka bootstrap-server port",
    )
    parser.add_argument(
        "--client",
        type=str,
        default="ch-kafka-python",
        help="custom client id for this producer",
    )
    args = parser.parse_args()

    config = {
        "bootstrap_servers": f"{args.server}:{args.port}",
        "client_id": args.client,
    }
    client = kafka.KafkaAdminClient(**config)
    consumer = kafka.KafkaConsumer(**config)
    # NOTE: _client is a private attribute of KafkaAdminClient; there is no
    # public accessor for the cluster metadata used here.
    cluster = client._client.cluster

    topics = cluster.topics()
    for topic in topics:
        print(f'Topic "{topic}":', end="")
        for partition in cluster.partitions_for_topic(topic):
            tp = kafka.TopicPartition(topic, partition)
            print(
                f" {partition} (begin: {consumer.beginning_offsets([tp])[tp]}, end: {consumer.end_offsets([tp])[tp]})",
                end="",
            )
        print()

    groups = client.list_consumer_groups()
    for group in groups:
        print(f'Group "{group[0]}" ({group[1]}):')
        consumer = kafka.KafkaConsumer(**config, group_id=group[0])
        offsets = client.list_consumer_group_offsets(group[0])
        for topic, offset in offsets.items():
            print(
                f"\t{topic.topic}[{topic.partition}]: {consumer.beginning_offsets([topic])[topic]}, {offset.offset}, {consumer.end_offsets([topic])[topic]}"
            )
        consumer.close()
    client.close()
    return 0

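# Hypothetical invocation, assuming the script above is saved as
# kafka_status.py:
#
#   python kafka_status.py --server broker1.example.com --port 9092
#
# It prints begin/end offsets per topic partition, then the committed offset
# of every consumer group next to the partition's begin/end offsets.
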
def latest_offsets(self):
    if not self._latest_offsets:
        consumer = self._create_util_consumer(group_none=True)
        partitions = [kafka.TopicPartition(self._topic, p.partition)
                      for p in self._partitions]
        consumer.assign(partitions)
        # With no consumer group, position() resolves via auto_offset_reset
        # ('latest' by default), yielding the end offset of each partition.
        self._latest_offsets = {p.partition: consumer.position(p)
                                for p in partitions}
        consumer.close()
    return self._latest_offsets

def transmitterRun(self):
    for message in self.consumer:
        if message.value:
            try:
                data = str(message.value)
                self.s.send(data.encode())
                self.offset = self.consumer.position(
                    kafka.TopicPartition('purchaseData', 0))
                print("\noffset is", self.offset)
            except socket.error:
                print("Exception : socket error in transmitterRun()")
    self.consumer.close()
    self.s.close()
    self.isStopped = True
    self.setTransmitter()

def _get_topic_partitions(self):
    partitions = self.client.partitions_for_topic(self.topic_name)
    if not partitions:
        raise MissingTopicError('Could not find topic %s. Does it exist?'
                                % self.topic_name)
    return [kafka.TopicPartition(self.topic_name, partition=partition)
            for partition in partitions]

def receive(bootstrap_servers, topic, callback, offset):
    consumer = kafka.KafkaConsumer(bootstrap_servers=bootstrap_servers,
                                   enable_auto_commit=False)
    partition = kafka.TopicPartition(topic, 0)
    consumer.assign([partition])
    if offset is not None:
        consumer.seek(partition, offset)
    for msg in consumer:
        callback(msg)

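# Usage sketch for receive(). The broker address, topic, and callback are
# placeholders; receive() blocks indefinitely and only reads partition 0.
def print_record(msg):
    print(f"offset={msg.offset} value={msg.value!r}")

receive(['localhost:9092'], 'my-topic', print_record, offset=None)
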
def get_last_offset_with_context(context):
    consumer = kafka.KafkaConsumer(
        bootstrap_servers=context.kafka_bootstrap_servers,
        enable_auto_commit=False)
    partition = kafka.TopicPartition(context.kafka_topic, 0)
    consumer.assign([partition])
    pos = consumer.position(partition)
    consumer.close(autocommit=False)
    return pos

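# Usage sketch for get_last_offset_with_context. `context` only needs the two
# attributes read above, so a SimpleNamespace stands in here; the values are
# placeholders.
from types import SimpleNamespace

context = SimpleNamespace(kafka_bootstrap_servers=['localhost:9092'],
                          kafka_topic='my-topic')
last_offset = get_last_offset_with_context(context)
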
def seek(self, consumer, topic, partition):
    KafkaOffset = apps.get_model(app_label='logpipe', model_name='KafkaOffset')
    tp = kafka.TopicPartition(topic=topic, partition=partition)
    try:
        obj = KafkaOffset.objects.get(topic=topic, partition=partition)
        logger.debug('Seeking to offset "%s" on topic "%s", partition "%s"'
                     % (obj.offset, topic, partition))
        consumer.client.seek(tp, obj.offset)
    except KafkaOffset.DoesNotExist:
        logger.debug('Seeking to beginning of topic "%s", partition "%s"'
                     % (topic, partition))
        consumer.client.seek_to_beginning(tp)

def doPreinit(self, mode):
    self._command_sender = kafka.KafkaProducer(
        bootstrap_servers=self.brokers)
    # Set up the response message consumer
    self._response_consumer = kafka.KafkaConsumer(
        bootstrap_servers=self.brokers)
    self._response_topic = kafka.TopicPartition(self.response_topic, 0)
    self._response_consumer.assign([self._response_topic])
    self._response_consumer.seek_to_end()
    self.log.debug('Response topic consumer initial position = %s',
                   self._response_consumer.position(self._response_topic))

def rollback_offset(self, topic, partition, group_id, offset=None):
    if offset is None:
        committed_offset = self.engine.committed(
            kafka.TopicPartition(topic=topic, partition=partition))
        if committed_offset is None:
            raise Exception(
                'topic:%s, partition:%s, group_id:%s has no committed offset '
                'yet; you must provide an offset'
                % (topic, partition, group_id))
        # Roll back to the last committed offset when none is given explicitly.
        offset = committed_offset
    self.commit_offset(group_id=group_id, topic=topic,
                       partition=partition, offset=offset)

def assign_partition(self, topics: list):
    if Consumer().get_user_topics().intersection(
            {item['topic'] for item in topics}):
        for v in topics:
            tp = kafka.TopicPartition(topic=str(v['topic']),
                                      partition=int(v['partition']))
            self.tps.append(tp)
        self.engine.assign(self.tps)
    else:
        raise Exception('topics contains an unknown topic: %s' % topics)
    self._partition_mode = '1'
    return self

def info(self):
    """Print the offset information for all topics and partitions."""
    print('Offsets per Topic:')
    for topic in self._consumer.topics():
        print('\nTopic {}:\n'.format(topic))
        partitions = self._consumer.partitions_for_topic(topic)
        if partitions is None:  # pragma: no cover
            print('  Polling failed (please try again)')
            continue
        for partition in partitions:
            topic_partition = kafka.TopicPartition(topic, partition)
            self._consumer.assign([topic_partition])
            offset = self._consumer.position(topic_partition)
            print('  Partition {:<3}: {}'.format(partition, offset))

def _get_result_offsets(self):
    """Get the latest offsets for all partitions in topic"""
    consumer = kafka.KafkaConsumer(
        bootstrap_servers=self.brokers,
        auto_offset_reset='latest',
        api_version=mjolnir.kafka.BROKER_VERSION)
    partitions = [
        kafka.TopicPartition(self.topic_result, p)
        for p in consumer.partitions_for_topic(self.topic_result)
    ]
    consumer.assign(partitions)
    consumer.seek_to_end()
    offsets = [consumer.position(tp) for tp in partitions]
    consumer.close()
    return offsets

def get_offsets_from_kafka(self, group_id, topic, kafka_hosts):
    consumer = kafka.KafkaConsumer(bootstrap_servers=kafka_hosts,
                                   group_id=group_id,
                                   enable_auto_commit=False)
    # Force a metadata fetch so partitions_for_topic() is populated.
    consumer.topics()
    partitions = consumer.partitions_for_topic(topic)
    topic_partitions = [kafka.TopicPartition(topic, p) for p in partitions]
    kafka_offsets = {tp: consumer.committed(tp) for tp in topic_partitions}
    consumer.close()
    return kafka_offsets

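# A small follow-on sketch: combining the committed offsets above with the
# partition end offsets gives per-partition consumer lag. `monitor` is a
# hypothetical instance of the class owning get_offsets_from_kafka; committed()
# returns None for partitions never committed, hence the `or 0`.
committed = monitor.get_offsets_from_kafka('my-group', 'my-topic',
                                           ['localhost:9092'])
consumer = kafka.KafkaConsumer(bootstrap_servers=['localhost:9092'])
ends = consumer.end_offsets(list(committed))
lag = {tp.partition: ends[tp] - (committed[tp] or 0) for tp in committed}
consumer.close()
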
def _create_scan_consumer(self, partitions=None):
    self.consumer = kafka.KafkaConsumer(
        bootstrap_servers=self._brokers,
        group_id=self._group,
        enable_auto_commit=False,
        consumer_timeout_ms=1000,
        request_timeout_ms=120000,
        auto_offset_reset='earliest',
        api_version=self._api_version,
        max_partition_fetch_bytes=self._max_partition_fetch_bytes,
        **self._ssl_configs)
    partitions = [kafka.TopicPartition(self._topic, p)
                  for p in (partitions or [])]
    self.consumer.assign(partitions or self._partitions)
    self.processor.set_consumer(self.consumer)

def __init__(self, pull_config, push_config,
             pull_class=kafka.KafkaConsumer, storage_class=PostgresStorage):
    pull_config = copy.copy(pull_config)
    push_config = copy.copy(push_config)
    self.raw_data = None
    self.data = None
    self.storage = storage_class(**push_config)
    topics = pull_config.get('topics')
    if topics:
        del pull_config['topics']
    log.info('Initializing consumer')
    self.provider = pull_class(group_id='subscriber', **pull_config)
    log.info(f'assigning topic {topics} partition 0')
    self.provider.assign([kafka.TopicPartition(topics, 0)])

def get_offsets_closest_to_timestamp(
        self, topic_name: str,
        timestamp: pendulum.DateTime) -> Dict[int, OffsetWithTimestamp]:
    """
    Gets the offsets of the message(s) in `topic_name` whose timestamps are
    at, or right after, the given `timestamp`. The timestamps given in the
    result are the actual timestamps of the offsets that were found.

    If there is no message at or after the given `timestamp`, the resulting
    offset will be `-1`, i.e. the end of the topic partition _not including_
    the last message.

    :param topic_name: The topic to get the offsets for.
    :param timestamp: The timestamp to find offsets for.
    :return: Dict: partition id -> offset with timestamp.
    """
    config = self.config.create_kafka_python_config()
    with closing(kafka.KafkaConsumer(**config)) as consumer:
        topic_partitions = [
            kafka.TopicPartition(topic=topic_name, partition=partition)
            for partition in consumer.partitions_for_topic(topic_name)
        ]
        timestamp_ms = int(timestamp.timestamp() * 1000)
        offsets: Dict[
            kafka.TopicPartition,
            kafka.structs.OffsetAndTimestamp] = consumer.offsets_for_times(
                {tp: timestamp_ms for tp in topic_partitions})
        data: Dict[int, OffsetWithTimestamp] = {}
        for tp, offset_data in offsets.items():
            if offset_data is None:
                data[tp.partition] = OffsetWithTimestamp(
                    topic=tp.topic,
                    partition=tp.partition,
                    offset=OFFSET_END,
                    timestamp_ms=None)
            else:
                data[tp.partition] = OffsetWithTimestamp(
                    topic=tp.topic,
                    partition=tp.partition,
                    offset=offset_data.offset,
                    timestamp_ms=offset_data.timestamp,
                )
        return data

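# Usage sketch for get_offsets_closest_to_timestamp, assuming the same
# hypothetical inspector-style instance as above; OFFSET_END is the sentinel
# the method itself uses for "no message at/after the timestamp".
import pendulum

ts = pendulum.datetime(2023, 5, 1, 12, 0, 0)
offsets = inspector.get_offsets_closest_to_timestamp("my-topic", ts)
for partition, info in offsets.items():
    if info.offset == OFFSET_END:
        print(f"partition {partition}: no message at/after {ts}")
    else:
        print(f"partition {partition}: offset {info.offset} at {info.timestamp_ms} ms")
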
def load_state(self):
    consumer = kafka.KafkaConsumer(bootstrap_servers=self.server_list)
    partition = kafka.TopicPartition(self.topic, 0)
    consumer.assign([partition])
    end_offset = consumer.end_offsets([partition])[partition]
    if end_offset > 0:
        consumer.seek(partition, end_offset - 1)
        snapshot_poll_timeout_ms = int(config.get(
            'DEFAULT', 'snapshot_poll_timeout_sec', fallback=60)) * 1000
        records = consumer.poll(snapshot_poll_timeout_ms)
        if not records:
            raise RuntimeError(
                "No message received from Kafka during restore even though end_offset>0")
        last_msg = records[partition][0]
        state = last_msg.value
    else:
        state = None
    consumer.close()
    return state

def __init__(self, _id, topic, pool, data, redis_conn):
    self._i = 0
    self._id = _id
    self._key = f'{topic}:{_id}'
    self._pool = pool
    self._duels = data
    self._conns = {}
    self._funcs = {}
    self._topic = topic
    self._redis = redis_conn
    self._consumer = kafka.KafkaConsumer(
        bootstrap_servers=cfg.KAFKA_SERVERS, group_id=topic)
    # Each worker consumes exactly one partition, keyed by its id.
    self._consumer.assign([kafka.TopicPartition(topic, _id)])
    self._producer = kafka.KafkaProducer(
        bootstrap_servers=cfg.KAFKA_SERVERS)
    self._od = OrderedDict()
    self._stock = OrderedDict()
    self._work_queue: asyncio.Queue = None
    self._wait_queue: asyncio.Queue = None

def load_state(self):
    consumer = kafka.KafkaConsumer(**self.kafka_client_options)
    partition = kafka.TopicPartition(self.topic, 0)
    consumer.assign([partition])
    end_offset = consumer.end_offsets([partition])[partition]
    if end_offset > 0:
        # Read back only the most recent snapshot message.
        consumer.seek(partition, end_offset - 1)
        snapshot_poll_timeout_ms = self.snapshot_poll_timeout_sec * 1000
        records = consumer.poll(snapshot_poll_timeout_ms)
        if not records:
            raise RuntimeError(
                "No message received from Kafka during restore even though end_offset>0")
        last_msg = records[partition][0]
        state = last_msg.value
    else:
        state = None
    consumer.close()
    return state

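# A hedged sketch of the save-side counterpart implied by load_state: publish
# the serialized state to partition 0 of the snapshot topic so the most recent
# message is always the latest snapshot. The method itself is an assumption,
# not part of the original code, and it presumes kafka_client_options holds
# only connection settings valid for a producer as well.
def save_state(self, state: bytes):
    producer = kafka.KafkaProducer(**self.kafka_client_options)
    # Pin the snapshot to partition 0, matching where load_state reads from.
    producer.send(self.topic, value=state, partition=0)
    producer.flush()
    producer.close()
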
def test_offset_for_times(mocker):
    partitions = [kafka.TopicPartition('ut_topic', 0)]
    offsets_for_times = {tp: OffsetAndTimestamp(42, -1) for tp in partitions}
    positions = {tp: 747 for tp in partitions}

    mock = mocker.Mock()
    mock.offsets_for_times.return_value = offsets_for_times
    mock.position.side_effect = lambda tp: positions.get(tp, 0)

    # Uses the returned offset-for-time when provided
    offsets = client.offsets_for_times(mock, partitions, 987654321)
    assert len(offsets) == len(partitions)
    assert all(tp in offsets for tp in partitions)
    assert offsets[partitions[0]] == 42

    # When offsets_for_times returns None, falls back to the position at end
    offsets_for_times[partitions[0]] = None
    offsets = client.offsets_for_times(mock, partitions, 987654321)
    assert len(offsets) == len(partitions)
    assert all(tp in offsets for tp in partitions)
    assert offsets[partitions[0]] == 747

def subscribe(self, topic):
    """Create the thread that provides callbacks on new messages."""
    # Remove all the assigned topics
    self._consumer.unsubscribe()
    topics = self._consumer.topics()
    if topic not in topics:
        raise ConfigurationError('Provided topic %s does not exist' % topic)

    # Assign the partitions
    partitions = self._consumer.partitions_for_topic(topic)
    if not partitions:
        raise ConfigurationError('Cannot query partitions for %s' % topic)
    self._consumer.assign(
        [kafka.TopicPartition(topic, p) for p in partitions])

    self._stoprequest = False
    self._updater_thread = createThread('updater_' + topic,
                                        self._get_new_messages)
    self.log.debug('subscribed to updates from topic: %s' % topic)

def main(importkey, exportkey, seekval):
    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("Generated kafka import key %s" % kafka_import)
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("Generated kafka export key %s" % kafka_export)

    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush("Connected to redis")

    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    # Resume one past the last exported offset.
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "Obtained seek partition for kafka at Partition %i -- Offset %i"
        % (seek_partition, seek_offset))
    if seekval is not None:
        seek_offset = seekval
        helper_utils.std_flush(
            "Replaced seek offset for kafka at Partition %i -- Offset %i"
            % (seek_partition, seek_offset))
    helper_utils.std_flush("\n\n")

    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("Generated kafka consumer")
    topic_partition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([topic_partition])
    kafka_consumer.seek(topic_partition, seek_offset)
    helper_utils.std_flush("Set kafka consumer seek")

    count = 0
    for message in kafka_consumer:
        count += 1
        jsval = json.loads(message.value.decode())
        helper_utils.std_flush(jsval["streamtype"], str(count))

def read_offset_range(offset_range):
    if offset_range.end <= offset_range.start:
        # Raise exception?
        return
    # After a serialization round trip these fail an isinstance check;
    # re-instantiate so we have the expected type.
    tp = kafka.TopicPartition(*offset_range.tp)
    consumer = kafka.KafkaConsumer(
        bootstrap_servers=client_config.brokers,
        value_deserializer=lambda x: json.loads(x.decode('utf8')))
    try:
        consumer.assign([tp])
        consumer.seek(tp, offset_range.start)
        while True:
            poll_response = consumer.poll(timeout_ms=10000)
            if poll_response and tp in poll_response:
                for message in poll_response[tp]:
                    if message.offset > offset_range.end:
                        break
                    yield message.value
            if consumer.position(tp) >= offset_range.end:
                break
    finally:
        consumer.close()

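# Usage sketch for read_offset_range. OffsetRange is assumed here to be a
# simple container with .tp (a (topic, partition) tuple), .start, and .end,
# matching how the generator above unpacks it; the values and the `process`
# callback are placeholders.
offset_range = OffsetRange(tp=('my-topic', 0), start=100, end=200)
for value in read_offset_range(offset_range):
    # Each value is the JSON-decoded message payload.
    process(value)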