Example #1
    @classmethod
    def get_or_create_for_checkpoint_id(cls, checkpoint_id, topics):
        # breaks pillowtop separation from hq
        from corehq.apps.change_feed.topics import get_multi_topic_first_available_offsets

        all_offsets = get_multi_topic_first_available_offsets(topics)

        # (topic, partition) pairs that already have checkpoint rows
        already_created = list(
            cls.objects.filter(checkpoint_id=checkpoint_id, topic__in=topics)
            .distinct('topic', 'partition')
            .values_list('topic', 'partition'))

        to_create = []

        for tp, offset in all_offsets.items():
            if tp not in already_created:
                to_create.append(
                    cls(checkpoint_id=checkpoint_id,
                        topic=tp[0],
                        partition=tp[1],
                        offset=0))

        cls.objects.bulk_create(to_create)

        return list(
            cls.objects.filter(checkpoint_id=checkpoint_id, topic__in=topics))
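
Across Examples #1 to #3 the helper's result is consumed as a mapping keyed by (topic, partition) tuples, with the oldest still-retained offset as the value; Example #5 indexes the result by topic name instead, which suggests an older per-topic signature. A minimal sketch of the tuple-keyed shape these examples assume, with invented topics and numbers:

# Illustrative only: the shape Examples #1 to #3 rely on, not CommCare's code.
first_available = {
    ("form", 0): 1024,   # (topic, partition) -> oldest retained offset
    ("form", 1): 987,
    ("case", 0): 2310,
}
for (topic, partition), offset in first_available.items():
    print(f"{topic}[{partition}] starts at offset {offset}")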
Example #2
 def test_non_expired_checkpoint_iteration_strict(self):
     feed = KafkaChangeFeed(topics=[topics.FORM, topics.CASE], group_id='test-kafka-feed', strict=True)
     first_available_offsets = get_multi_topic_first_available_offsets([topics.FORM, topics.CASE])
     since = {
         topic: first_available for topic, first_available in first_available_offsets.items()
     }
     next(feed.iter_changes(since=since, forever=False))
Example #3
 def test_non_expired_checkpoint_iteration_strict(self):
     feed = KafkaChangeFeed(topics=[topics.FORM, topics.CASE],
                            client_id='test-kafka-feed',
                            strict=True)
     first_available_offsets = get_multi_topic_first_available_offsets(
         [topics.FORM, topics.CASE])
     next(feed.iter_changes(since=first_available_offsets, forever=False))
Example #5
 def get_min_max_offsets(self):
     # end is the next available offset per partition
     end = get_multi_topic_offset([COMMCARE_USER])[COMMCARE_USER]
     start = get_multi_topic_first_available_offsets([COMMCARE_USER])[COMMCARE_USER]
     return start, {partition: offset - 1 for partition, offset in end.items()}
 def test_expired_checkpoint_iteration_strict(self):
     feed = KafkaChangeFeed(topics=[topics.FORM, topics.CASE], group_id='test-kafka-feed', strict=True)
     first_available_offsets = get_multi_topic_first_available_offsets([topics.FORM, topics.CASE])
     since = {
         topic_partition: offset - 1
         for topic_partition, offset in first_available_offsets.items()
     }
     with self.assertRaises(UnavailableKafkaOffset):
         next(feed.iter_changes(since=since, forever=False))
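
Read together, the two strict-mode tests pin down the boundary: seeking exactly to the first available offset succeeds, while seeking one before it raises UnavailableKafkaOffset. A toy restatement of that contract, using a stand-in exception class since the real one lives in the feed's package:

class UnavailableKafkaOffset(Exception):
    """Stand-in for the exception the tests above expect."""

def validate_offset(requested, first_available):
    # Strict mode as the tests exercise it: anything older than the first
    # retained offset has expired and cannot be replayed.
    if requested < first_available:
        raise UnavailableKafkaOffset(
            f"offset {requested} expired; first available is {first_available}")
    return requested

validate_offset(1024, 1024)    # succeeds: exactly the first available offset
# validate_offset(1023, 1024)  # would raise UnavailableKafkaOffset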
Example #8
def _create_checkpoints_from_kafka(checkpoint_id, topics):
    # breaks pillowtop separation from hq
    from corehq.apps.change_feed.topics import get_multi_topic_first_available_offsets

    all_offsets = get_multi_topic_first_available_offsets(topics)

    for tp, offset in all_offsets.items():
        KafkaCheckpoint.objects.get_or_create(checkpoint_id=checkpoint_id,
                                              topic=tp[0],
                                              partition=tp[1],
                                              defaults={"offset": 0})
Example #10
    def handle(self, **options):
        topics = options["topics"]

        doc_type = options.get("doc_type")
        doc_subtype = options.get("doc_subtype")

        info = [f"Outputting changes for the [{', '.join(topics)}] topics. Press Ctrl-C to exit."]
        if doc_type or doc_subtype:
            info.append("Document filters:")
        if doc_type:
            info.append(f"\t        doc_type: {doc_type}")
        if doc_subtype:
            info.append(f"\t     doc_subtype: {doc_subtype}")
        info = "\n".join(info)
        self.stderr.write(f"{info}\n\n")

        partitions = [
            TopicPartition(topic, partition)
            for topic, partition in get_multi_topic_first_available_offsets(topics)
        ]

        consumer = get_consumer()
        consumer.assign(partitions)
        consumer.seek_to_end(*partitions)

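        # seek_to_end above discards history, so the loop below only tails new
        # messages; when the inner for-loop exhausts the consumer (presumably a
        # consumer timeout configured in get_consumer()), the waiting notice is
        # printed and polling resumes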
        try:
            while True:
                for message in consumer:
                    metadata = change_from_kafka_message(message).metadata

                    if doc_type and metadata.document_type != doc_type:
                        continue
                    if doc_subtype and metadata.document_subtype != doc_subtype:
                        continue

                    output = metadata.to_json()
                    output["partition"] = message.partition
                    output["offset"] = message.offset
                    self.stdout.write(f"{metadata.document_type}: {json.dumps(output)}\n")
                    self.stdout.flush()

                self.stdout.write("\nWaiting for changes...\n")
        except KeyboardInterrupt:
            return
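
The assign/seek_to_end pair above is the standard way to tail topics without joining a consumer group. A minimal standalone sketch of the same pattern, assuming a kafka-python KafkaConsumer (Example #10's get_consumer() is CommCare's own wrapper; the broker address and topic names here are invented):

from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(bootstrap_servers="localhost:9092",
                         consumer_timeout_ms=10_000)
partitions = [TopicPartition("form", 0), TopicPartition("form", 1)]
consumer.assign(partitions)          # manual assignment, no group rebalancing
consumer.seek_to_end(*partitions)    # skip history, stream only new messages
for message in consumer:             # stops after 10s without new messages
    print(message.topic, message.partition, message.offset)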
Example #11
    def handle(self, **options):
        topic = options["topic"]

        doc_ids = options.get("doc_ids")
        if doc_ids:
            doc_ids = doc_ids.split(",")

        if options.get("doc_id_file"):
            if doc_ids:
                raise CommandError(
                    "Can not supply doc IDs via file and via command line")

            with open(options["doc_id_file"], "r") as f:
                doc_ids = [line.strip() for line in f.readlines()]

        if doc_ids:
            # encode to match kafka message key
            doc_ids = {str(doc_id).encode() for doc_id in doc_ids}

        doc_type = options.get("doc_type")
        doc_subtype = options.get("doc_subtype")

        start, end = options.get("start_date"), options.get("end_date")

        info = [f"Searching the '{topic}' Kafka topic for documents matching:"]
        if doc_ids:
            info.append(f"\t doc_ids (count): {len(doc_ids)}")
        if doc_type:
            info.append(f"\t        doc_type: {doc_type}")
        if doc_subtype:
            info.append(f"\t     doc_subtype: {doc_subtype}")
        if start:
            info.append(f"\t published after: {start}")
        if end:
            info.append(f"\tpublished before: {end}")
        info = "\n".join(info)
        self.stderr.write(f"{info}\n\n")

        partitions = [
            TopicPartition(topic, partition) for topic, partition in
            get_multi_topic_first_available_offsets([topic])
        ]

        consumer = get_consumer()

        consumer.assign(partitions)
        if start:
            self.stderr.write(
                f"Searching for best offsets to start based on start date: {start}\n"
            )
            offsets = get_offsets(partitions, start)
            for partition in partitions:
                consumer.seek(partition, offsets[partition])
        else:
            consumer.seek_to_beginning(*partitions)

        count = 0
        last_progress = None
        buffer = []
        for message in consumer:
            count += 1

            metadata = None
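            # Only every 1,000th message is decoded eagerly: flush the output
            # buffer, log progress periodically, and check the end date without
            # paying the deserialization cost on every change.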
            if count % 1000 == 0:
                if buffer:
                    self.stdout.writelines(buffer)
                    self.stdout.flush()
                    buffer = []
                metadata = change_from_kafka_message(message).metadata
                timestamp = metadata.publish_timestamp
                if not last_progress or (timestamp - last_progress).total_seconds() > (3600 * 2):
                    self.stderr.write(
                        f"\nExamined {count} changes. Current point: {timestamp}\n"
                    )
                    last_progress = timestamp

                if end and metadata.publish_timestamp > end:
                    break

            if doc_ids and message.key not in doc_ids:
                continue

            if not metadata:
                change = change_from_kafka_message(message)
                metadata = change.metadata

            if doc_type and metadata.document_type != doc_type:
                continue
            if doc_subtype and metadata.document_subtype != doc_subtype:
                continue

            if end and metadata.publish_timestamp > end:
                break

            output = metadata.to_json()
            output["partition"] = message.partition
            output["offset"] = message.offset
            buffer.append(f"{json.dumps(output)}\n")

        if buffer:
            self.stdout.writelines(buffer)
            self.stdout.flush()
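
Example #11 can filter cheaply on message keys because Kafka delivers keys as raw bytes: the doc IDs are encoded into a byte set up front (the encode() near the top), so the hot loop is a plain set lookup with no deserialization. Roughly, with invented IDs:

# Invented values: how the byte-key filter in Example #11 behaves.
doc_ids = {str(d).encode() for d in ["abc123", "def456"]}
message_key = b"abc123"          # kafka-python delivers keys as bytes
assert message_key in doc_ids    # cheap filter, no message decoding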
Example #12
 def get_min_max_offsets(self):
     end = get_multi_topic_offset([COMMCARE_USER])[COMMCARE_USER]
     start = get_multi_topic_first_available_offsets([COMMCARE_USER])[COMMCARE_USER]
     return start, end - 1  # end is next available offset
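
Example #12's get_min_max_offsets shows how the two offset helpers bracket the retained range: get_multi_topic_offset reports the end (the offset the next message will get), while get_multi_topic_first_available_offsets reports the oldest offset still on the broker. With invented numbers:

# Invented numbers: the retained range of a partition is
# [first_available, next_offset), so the last readable offset is next_offset - 1.
first_available = 120   # from get_multi_topic_first_available_offsets
next_offset = 205       # from get_multi_topic_offset ("end is next available offset")
min_offset, max_offset = first_available, next_offset - 1
assert (min_offset, max_offset) == (120, 204)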