def get_or_create_for_checkpoint_id(cls, checkpoint_id, topics):
    # breaks pillowtop separation from hq
    from corehq.apps.change_feed.topics import get_multi_topic_first_available_offsets

    all_offsets = get_multi_topic_first_available_offsets(topics)
    already_created = list(
        cls.objects.filter(checkpoint_id=checkpoint_id, topic__in=topics)
        .distinct('topic', 'partition')
        .values_list('topic', 'partition')
    )

    to_create = []
    for tp, offset in all_offsets.items():
        if tp not in already_created:
            to_create.append(
                cls(checkpoint_id=checkpoint_id, topic=tp[0], partition=tp[1], offset=0)
            )

    cls.objects.bulk_create(to_create)
    return list(
        cls.objects.filter(checkpoint_id=checkpoint_id, topic__in=topics)
    )
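# Hypothetical usage sketch for the classmethod above. It assumes the method
# lives on the KafkaCheckpoint model (as later snippets suggest) and that the
# checkpoint id is an arbitrary string; neither is confirmed by this snippet
# alone.
from corehq.apps.change_feed import topics

checkpoints = KafkaCheckpoint.get_or_create_for_checkpoint_id(
    'example-pillow-checkpoint',  # hypothetical checkpoint id
    [topics.FORM, topics.CASE],
)
# One row per (topic, partition); partitions with no existing row start at offset 0.
for checkpoint in checkpoints:
    print(checkpoint.topic, checkpoint.partition, checkpoint.offset)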
def test_non_expired_checkpoint_iteration_strict(self):
    feed = KafkaChangeFeed(
        topics=[topics.FORM, topics.CASE], group_id='test-kafka-feed', strict=True)
    first_available_offsets = get_multi_topic_first_available_offsets(
        [topics.FORM, topics.CASE])
    since = {
        topic: first_available
        for topic, first_available in first_available_offsets.items()
    }
    next(feed.iter_changes(since=since, forever=False))
def test_non_expired_checkpoint_iteration_strict(self):
    feed = KafkaChangeFeed(
        topics=[topics.FORM, topics.CASE], client_id='test-kafka-feed', strict=True)
    first_available_offsets = get_multi_topic_first_available_offsets(
        [topics.FORM, topics.CASE])
    next(feed.iter_changes(since=first_available_offsets, forever=False))
def get_min_max_offsets(self):
    end = get_multi_topic_offset([COMMCARE_USER])[COMMCARE_USER]
    start = get_multi_topic_first_available_offsets([COMMCARE_USER])[COMMCARE_USER]
    return start, {
        partition: offset - 1
        for partition, offset in end.items()
    }  # end is next available offset
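# Worked example of the arithmetic above, with made-up offsets. Kafka reports
# the *next* offset to be written per partition, so the last readable message
# sits one position earlier.
end = {0: 105, 1: 42}  # hypothetical "next available" offsets per partition
last_readable = {partition: offset - 1 for partition, offset in end.items()}
assert last_readable == {0: 104, 1: 41}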
def test_expired_checkpoint_iteration_strict(self):
    feed = KafkaChangeFeed(
        topics=[topics.FORM, topics.CASE], group_id='test-kafka-feed', strict=True)
    first_available_offsets = get_multi_topic_first_available_offsets(
        [topics.FORM, topics.CASE])
    since = {
        topic_partition: offset - 1
        for topic_partition, offset in first_available_offsets.items()
    }
    with self.assertRaises(UnavailableKafkaOffset):
        next(feed.iter_changes(since=since, forever=False))
def test_expired_checkpoint_iteration_strict(self):
    feed = KafkaChangeFeed(
        topics=[topics.FORM, topics.CASE], client_id='test-kafka-feed', strict=True)
    first_available_offsets = get_multi_topic_first_available_offsets(
        [topics.FORM, topics.CASE])
    since = {
        topic_partition: offset - 1
        for topic_partition, offset in first_available_offsets.items()
    }
    with self.assertRaises(UnavailableKafkaOffset):
        next(feed.iter_changes(since=since, forever=False))
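# The two tests above pin down the strict=True contract: asking for an offset
# that Kafka has already expired raises UnavailableKafkaOffset, while the
# first still-available offset iterates cleanly. A minimal sketch of the kind
# of check involved (illustrative only, not the feed's actual code):
def _validate_offset(requested_offset, first_available_offset):
    if requested_offset < first_available_offset:
        raise UnavailableKafkaOffset(
            f'offset {requested_offset} is no longer available')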
def _create_checkpoints_from_kafka(checkpoint_id, topics):
    # breaks pillowtop separation from hq
    from corehq.apps.change_feed.topics import get_multi_topic_first_available_offsets

    all_offsets = get_multi_topic_first_available_offsets(topics)

    for tp, offset in all_offsets.items():
        KafkaCheckpoint.objects.get_or_create(
            checkpoint_id=checkpoint_id,
            topic=tp[0],
            partition=tp[1],
            defaults={"offset": 0},
        )
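# Re-running _create_checkpoints_from_kafka is safe: Django's get_or_create
# looks rows up by (checkpoint_id, topic, partition) and only applies
# `defaults` on insert, so existing checkpoints keep their committed offsets.
# Hypothetical invocation (checkpoint id and topic name are made up):
_create_checkpoints_from_kafka('example-checkpoint', ['case'])
_create_checkpoints_from_kafka('example-checkpoint', ['case'])  # no-op second time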
def handle(self, **options):
    topics = options["topics"]
    doc_type = options.get("doc_type")
    doc_subtype = options.get("doc_subtype")

    info = [f"Outputting changes for the [{', '.join(topics)}] topics. Press Ctrl-C to exit."]
    if doc_type or doc_subtype:
        info.append("Document filters:")
        if doc_type:
            info.append(f"\t doc_type: {doc_type}")
        if doc_subtype:
            info.append(f"\t doc_subtype: {doc_subtype}")
    info = "\n".join(info)
    self.stderr.write(f"{info}\n\n")

    partitions = [
        TopicPartition(topic, partition)
        for topic, partition in get_multi_topic_first_available_offsets(topics)
    ]
    consumer = get_consumer()
    consumer.assign(partitions)
    consumer.seek_to_end(*partitions)

    try:
        while True:
            for message in consumer:
                metadata = change_from_kafka_message(message).metadata
                if doc_type and metadata.document_type != doc_type:
                    continue
                if doc_subtype and metadata.document_subtype != doc_subtype:
                    continue
                output = metadata.to_json()
                output["partition"] = message.partition
                output["offset"] = message.offset
                self.stdout.write(f"{metadata.document_type}: {json.dumps(output)}\n")
                self.stdout.flush()
            self.stdout.write("\nWaiting for changes...\n")
    except KeyboardInterrupt:
        return
def handle(self, **options):
    topic = options["topic"]
    doc_ids = options.get("doc_ids")
    if doc_ids:
        doc_ids = doc_ids.split(",")
    if options.get("doc_id_file"):
        if doc_ids:
            raise CommandError("Can not supply doc IDs via file and via command line")
        with open(options["doc_id_file"], "r") as f:
            doc_ids = [line.strip() for line in f.readlines()]
    if doc_ids:
        # encode to match kafka message key
        doc_ids = {str(doc_id).encode() for doc_id in doc_ids}

    doc_type = options.get("doc_type")
    doc_subtype = options.get("doc_subtype")
    start, end = options.get("start_date"), options.get("end_date")

    info = [f"Searching the '{topic}' Kafka topic for documents matching:"]
    if doc_ids:
        info.append(f"\t doc_ids (count): {len(doc_ids)}")
    if doc_type:
        info.append(f"\t doc_type: {doc_type}")
    if doc_subtype:
        info.append(f"\t doc_subtype: {doc_subtype}")
    if start:
        info.append(f"\t published after: {start}")
    if end:
        info.append(f"\tpublished before: {end}")
    info = "\n".join(info)
    self.stderr.write(f"{info}\n\n")

    partitions = [
        TopicPartition(topic, partition)
        for topic, partition in get_multi_topic_first_available_offsets([topic])
    ]
    consumer = get_consumer()
    consumer.assign(partitions)
    if start:
        self.stderr.write(
            f"Searching for best offsets to start based on start date: {start}\n"
        )
        offsets = get_offsets(partitions, start)
        for partition in partitions:
            consumer.seek(partition, offsets[partition])
    else:
        consumer.seek_to_beginning(*partitions)

    count = 0
    last_progress = None
    buffer = []
    for message in consumer:
        count += 1
        metadata = None
        if count % 1000 == 0:
            if buffer:
                self.stdout.writelines(buffer)
                self.stdout.flush()
                buffer = []
            metadata = change_from_kafka_message(message).metadata
            timestamp = metadata.publish_timestamp
            if not last_progress or (timestamp - last_progress).total_seconds() > (3600 * 2):
                self.stderr.write(
                    f"\nExamined {count} changes. Current point: {timestamp}\n"
                )
                last_progress = timestamp
            if end and metadata.publish_timestamp > end:
                break
        if doc_ids and message.key not in doc_ids:
            continue
        if not metadata:
            change = change_from_kafka_message(message)
            metadata = change.metadata
        if doc_type and metadata.document_type != doc_type:
            continue
        if doc_subtype and metadata.document_subtype != doc_subtype:
            continue
        if end and metadata.publish_timestamp > end:
            break
        output = metadata.to_json()
        output["partition"] = message.partition
        output["offset"] = message.offset
        buffer.append(f"{json.dumps(output)}\n")

    if buffer:
        self.stdout.writelines(buffer)
        self.stdout.flush()
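# The command above follows the usual kafka-python pattern for a bounded
# replay: assign explicit partitions, seek to a start point, then iterate.
# A stripped-down sketch under that assumption (broker address and topic
# name are made up):
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers='localhost:9092')  # hypothetical broker
partitions = [TopicPartition('example-topic', 0)]
consumer.assign(partitions)              # manual assignment, no consumer group
consumer.seek_to_beginning(*partitions)  # or consumer.seek(tp, offset)
for message in consumer:
    print(message.partition, message.offset, message.key)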
def get_min_max_offsets(self):
    end = get_multi_topic_offset([COMMCARE_USER])[COMMCARE_USER]
    start = get_multi_topic_first_available_offsets([COMMCARE_USER])[COMMCARE_USER]
    return start, end - 1  # end is next available offset