Example #1
    def get_some_message(self, num, start_offset=None):
        # start_offset:       if None, start fetching from the group's committed offset on the Kafka server
        # message_num:        number of messages to fetch
        # groupid:            consumer group id
        # client_id:          consumer client id
        # topic:              topic to consume
        # partition:          partition to consume
        # bootstrap_servers:  Kafka server address (e.g. localhost:9092)
        if start_offset is not None:
            self.consumer.seek(self.topic_partition, start_offset)
            self.consumer.commit(
                offsets={self.topic_partition: OffsetAndMetadata(start_offset, None)})
        res = self.consumer.poll(timeout_ms=5000, max_records=num)
        a = 0
        if res:
            for records in res.values():
                for record in records:
                    print(record)
                    self.message.append(record.value)
                a += len(records)
            old_offset = self.consumer.committed(self.topic_partition)
            print(old_offset)
            new_offset = old_offset + a
            self.consumer.commit(
                offsets={self.topic_partition: OffsetAndMetadata(new_offset, None)})
        return {'message_num': a, 'message_list': self.message}
def test_convert_partition_offsets_translates_partition_offsets_to_committable_topic_offsets():
    offsets = convert_partition_offsets('foo', {0: 100, 1: 200})
    assert offsets == {
        TopicPartition(topic='foo', partition=0): OffsetAndMetadata(offset=100, metadata=''),
        TopicPartition(topic='foo', partition=1): OffsetAndMetadata(offset=200, metadata=''),
    }
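The convert_partition_offsets helper exercised by this test does not appear anywhere in this listing; a minimal sketch consistent with the assertion above could look like the following (the name and signature are taken from the test, the body is an assumption):

from kafka import OffsetAndMetadata, TopicPartition


def convert_partition_offsets(topic, partition_offsets):
    """Translate a {partition: offset} mapping into the {TopicPartition: OffsetAndMetadata}
    form that KafkaConsumer.commit() expects."""
    return {
        TopicPartition(topic=topic, partition=partition):
            OffsetAndMetadata(offset=offset, metadata='')
        for partition, offset in partition_offsets.items()
    }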
    def consume(self) -> bool:
        """
        Consumes a single message from the subscribed topic and indexes it into the elasticsearch index.

        Returns True if successful, False otherwise.
        """
        message = next(self._consumer)

        key = message.key.decode('utf-8')
        try:
            value = json.loads(message.value.decode('utf-8'))
        except JSONDecodeError as ex:
            value = {
                'message': message.value.decode('utf-8'),
                'error': '{}'.format(ex)
            }
        result = self._index.index_into(value, key)

        if result:
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit(
                        {assignment: OffsetAndMetadata(pos, "")})
        # self._time_logger.info("Consumed and indexed one message.")
        return result
def consume_one_message_at_a_time(conf):

    kafka_brokers = conf.KAFKA_BROKERS
    topic = conf.demo_topic
    group_id = f'{conf.demo_group_id}_2'

    print(
        f'KAFKA_BROKERS: {kafka_brokers}\n Topic {topic}\n group id: {group_id}'
    )

    consumer = KafkaConsumer(topic,
                             bootstrap_servers=kafka_brokers,
                             group_id=group_id,
                             enable_auto_commit=False,
                             max_poll_records=1)

    print(f'bootstrap_servers: {kafka_brokers} subscribing to {topic}')
    consumer.subscribe([topic])

    for message in consumer:
        print(f"message is of type: {type(message)}")
        print(message)

        do_something_time_consuming()

        partition = TopicPartition(message.topic, message.partition)
        # commit the offset of the next message to read; the metadata field is an
        # opaque application string (or None), not the partition set returned by
        # partitions_for_topic()
        offsets = OffsetAndMetadata(message.offset + 1, '')
        options = {partition: offsets}

        print(f'\noptions: {options}\n')

        response = consumer.commit(offsets=options)
    def start(self):
        for msg in self.consumer:
            logging.info(msg)
            value = msg.value
            topic_partition = TopicPartition(self.kafka_consumer_topic, msg.partition)
            offset = OffsetAndMetadata(msg.offset, '')

            message_id = value['message_id']
            message_payload = value['payload']

            try:
                start_execution_datetime = str(datetime.utcnow())
                time_before_face_location = current_milli_time()
                process_image_result = self.message_processing.process_message(message_payload)
                time_after_face_location = current_milli_time()

                if process_image_result is not None:

                    response_message = {
                        'payload': process_image_result,
                        'message_id': str(uuid.uuid4()),
                        'consumer_id': self.consumer_id,
                        'host': get_hostname(),
                        'execution_time_ms': time_after_face_location - time_before_face_location,
                        'start_execution_datetime': start_execution_datetime
                    }
                    self.consumer.commit(offsets={topic_partition: offset})

                    future_send = self.producer.send(self.kafka_response_topic, response_message)
                    future_send.get(2 * 60)
                    logging.info('Saved')
                else:
                    logging.error("Can't process message '%s' on consumer '%s', skip it", message_id, self.consumer_id)
            except Exception as e:
                logging.error('Exception: %s', e)
    def _run(self):
        consumer = KafkaConsumer(self.kafka_face_recognise_result_topic,
                                 bootstrap_servers=[self.kafka_host],
                                 auto_offset_reset='earliest',
                                 enable_auto_commit=False,
                                 group_id='recognised_faces',
                                 value_deserializer=kafka_json_deserializer,
                                 max_poll_records=20)
        db_image_service = DbImageService(self.db_uri, self.db_image_name)
        for msg in consumer:
            message_v = msg.value
            message_payload = message_v['payload']

            crop_face_image_id = message_payload['db_id']
            crop_face_image_document = db_image_service.find_crop_face_by_id(
                crop_face_image_id)

            if crop_face_image_document:
                recognised_faces = message_payload['recognised_faces']
                db_image_service.update_crop_face_recognise(
                    crop_face_image_id, recognised_faces)
            else:
                logging.warning(
                    "Can't find crop face image by %s",
                    str({'_id': ObjectId(message_payload['db_id'])}))

            topic_partition = TopicPartition(
                self.kafka_face_recognise_result_topic, msg.partition)
            offset = OffsetAndMetadata(msg.offset, '')
            consumer.commit(offsets={topic_partition: offset})
            logging.info("after commit %s", message_v['consumer_id'])
Example #7
def main():
    consumer = KafkaConsumer(bootstrap_servers=["worker2.hengan.shop:9092"],group_id='me',
                             auto_offset_reset = 'earliest',
                             #value_deserializer = lambda m: json.loads(m.decode('utf-8')),
                             enable_auto_commit=False)
    #consumer.assign([TopicPartition('foobar2',0)])
    consumer.subscribe(['foobar2'])
    #consumer.seek(TopicPartition('foobar2',0),100)
    
    print(consumer.topics())
    print(consumer.subscription())
    ret = consumer.poll()
    print(ret)
    print(consumer.assignment())
    #consumer.seek_to_beginning()
    try:
        for message in consumer:
            print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,message.offset, message.key,message.value))
            print(consumer.partitions_for_topic('foobar2'))
            print("offset is %d" % message.offset)
            tp1 = TopicPartition(topic="foobar2", partition=0)
            # metadata is an opaque string or None, not an int
            om = OffsetAndMetadata(offset=message.offset + 1, metadata=None)
            consumer.commit({tp1: om})
            break
    except KeyboardInterrupt:
        sys.exit()    
Example #8
    def success(self, model):
        from kafka import TopicPartition, OffsetAndMetadata

        kafka_msg = model.raw_message
        self._consumer.commit({
            TopicPartition(kafka_msg.topic, kafka_msg.partition):
            OffsetAndMetadata(kafka_msg.offset + 1, "")
        })
    def reset_offset(self, reset_offset_value):
        partitions_offset = {}
        for partition_id in self.consumer.partitions_for_topic(self.topic):
            partitions_offset[TopicPartition(
                self.topic,
                partition_id)] = OffsetAndMetadata(reset_offset_value, '')

        self.consumer.commit(partitions_offset)
    def _run(self):
        consumer = KafkaConsumer(self.kafka_image_recognised_result_topic,
                                 bootstrap_servers=[self.kafka_host],
                                 auto_offset_reset='earliest',
                                 enable_auto_commit=False,
                                 group_id='recognised_faces',
                                 value_deserializer=kafka_json_deserializer,
                                 max_poll_records=20)
        db_image_service = DbImageService(self.db_uri, self.db_image_name)
        for msg in consumer:
            message_v = msg.value
            message_payload = message_v['payload']
            recognised_objects = message_payload['recognised_objects']
            logging.info('Receive message - consumer-id: %s',
                         message_v['consumer_id'])
            image_document = db_image_service.find_image_by_id_path(
                message_payload['db_id'], message_payload['original_path'])
            if image_document:
                db_image_service.update_image_faces_process_step(
                    message_payload['db_id'], message_payload['original_path'],
                    recognised_objects, message_payload['timestamp'])

                cropped_faces = []
                for r in recognised_objects:
                    for c in r['cropped_faces']:
                        cropped_faces.append({
                            'image_id': image_document['_id'],
                            'crop_face_image_path': c['face_image_path'],
                            'crop_face_id': c['crop_face_id'],
                            'face_location': c['face_location'],
                            'face_recognised': False,
                            'face_recognised_in_queue': False,
                            'recognised': None
                        })

                for c in cropped_faces:
                    db_image_service.insert_crop_face_image(c)

            else:
                logging.warning(
                    "Can't find image by %s",
                    str({
                        '_id': ObjectId(message_payload['db_id']),
                        'path': message_payload['original_path']
                    }))

            topic_partition = TopicPartition(
                self.kafka_image_recognised_result_topic, msg.partition)
            offset = OffsetAndMetadata(msg.offset, '')
            consumer.commit(offsets={topic_partition: offset})
            logging.info("after commit %s", message_v['consumer_id'])
Example #11
def sync_stream(kafka_config, stream, state):
    consumer = KafkaConsumer(
        kafka_config['topic'],
        group_id=kafka_config['group_id'],
        enable_auto_commit=False,
        consumer_timeout_ms=kafka_config.get('consumer_timeout_ms', 10000),
        auto_offset_reset='earliest',
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        bootstrap_servers=kafka_config['bootstrap_servers'])

    send_schema_message(stream)
    stream_version = singer.get_bookmark(state, stream['tap_stream_id'],
                                         'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)

    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['tap_stream_id'], version=stream_version)

    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    for message in consumer:
        LOGGER.info("%s:%s:%s: key=%s value=%s" %
                    (message.topic, message.partition, message.offset,
                     message.key, message.value))
        # stream['schema']
        record = singer.RecordMessage(stream=stream['tap_stream_id'],
                                      record=message.value,
                                      time_extracted=time_extracted)

        [valid, error] = validate_record(stream['schema'], record)
        rows_saved = rows_saved + 1

        if valid:
            singer.write_message(record)
        elif kafka_config.get('reject_topic'):
            send_reject_message(kafka_config, record, error)
        else:
            raise Exception(
                "record failed validation and no reject_topic was specified")

        state = singer.write_bookmark(state, stream['tap_stream_id'], 'offset',
                                      message.offset)

        #commit offsets because we processed the message
        tp = TopicPartition(message.topic, message.partition)
        consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(
                singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
Example #12
def get_cons_offsets(topic, cons):
    """
    Get current committed consumer group offsets for topic.
    """
    partitions_for_topic = cons.partitions_for_topic(topic)
    partitions = [TopicPartition(topic, i) for i in partitions_for_topic]
    committed_offsets = {
        partition: OffsetAndMetadata(cons.committed(partition), None)
        for partition in partitions
    }
    return committed_offsets
Example #13
def teardown_kafka_reader(q, positions, _setup=None):
    q.cancel_join_thread()
    cons = _setup['consumer']
    offsets = dict([(tp,
                     OffsetAndMetadata(offset=positions[tp.partition],
                                       metadata=''))
                    for tp in cons.assignment()])
    cons.commit(offsets)
    cons.close(autocommit=False)
    logger.info("exit kafka reader process")
    raise OpExit("exit kafka reader process")
Example #14
    def reset_offset(self, reset_offset_value):
        # Trigger Rebalance
        self.consumer.subscribe(self.topic)
        self.consumer.poll(0)
        # Reset Offset
        partitions_offset = {}
        for partition_id in self.consumer.partitions_for_topic(self.topic):
            partitions_offset[TopicPartition(
                self.topic,
                partition_id)] = OffsetAndMetadata(reset_offset_value, '')

        self.consumer.commit(partitions_offset)
    def success_result(self, msg, consumer, result):
        """
        :param msg: ConsumerRecord
        :param consumer: KafkaConsumer
        :param result: Unwrapped success result
        :return:
        """
        if not self.enable_auto_commit:
            options = {
                TopicPartition(msg.topic, msg.partition):
                OffsetAndMetadata(msg.offset + 1, None)
            }
            consumer.commit(options)
Example #16
    def _commit_offsets(self, topic_partition: TopicPartition, offset: int):
        """Commits offsets for the partition of a given topic.

        This effectively advances the index so that future reads from the same Kafka consumer group will not read any
        records up to that offset.

        :param topic_partition: Partition of the topic where offsets are to be committed.
        :param offset: Largest offset read so far.
        :return:
        """

        self._consumer.commit({
            topic_partition: OffsetAndMetadata(offset=offset + 1, metadata=''),
        })
Example #17
    def _commit_offsets(self, partition):
        """Commit kafka consumer group offsets.
        """
        commit_offsets = {}
        to_delete = []
        for tp, offset in six.iteritems(self._consumer_offsets):
            if tp.partition == partition:
                commit_offsets[tp] = OffsetAndMetadata(offset, None)
                to_delete.append(tp)

        for tp in to_delete:
            del self._consumer_offsets[tp]

        self._consumer.commit(commit_offsets)
Example #18
def get_failed_events(config, from_topic, to_topic):
    consumer = KafkaConsumer(
        bootstrap_servers=config['kafka']['hosts'],
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        auto_commit_interval_ms=1000,
        group_id=to_topic
    )

    n_partitions = len(consumer.partitions_for_topic(from_topic))
    events_to_publish = list()
    commit_options = dict()

    for partition_idx in range(n_partitions):
        partition = TopicPartition(from_topic, partition_idx)
        consumer.assign([partition])

        # we'll start reading from this position
        from_offset = consumer.position(partition)

        # obtain the last offset value
        consumer.seek_to_end(partition)
        to_offset = consumer.position(partition)

        print(f'partition_idx: {partition_idx}, from_offset: {from_offset}, to_offset: {to_offset}')
        # no new events since last replay
        if from_offset >= to_offset:
            continue

        consumer.seek(partition, from_offset)

        for message in consumer:
            event = str(message.value, 'utf-8')
            events_to_publish.append(event)
            if message.offset >= to_offset - 1:
                """
                from kafka-python team on github (https://github.com/dpkp/kafka-python/issues/645):
                    "the metadata is really just an opaque string. You can also pass None. 
                    Nothing uses metadata internally, it is there as a way for you to s
                    tore application-specific data if needed." 
                """
                commit_options[partition] = OffsetAndMetadata(message.offset + 1, None)
                break

    consumer.commit(commit_options)
    return events_to_publish
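As the note quoted in the example above says, the metadata field of OffsetAndMetadata is just an opaque, application-defined string. A minimal sketch illustrating that (the broker address, topic, group id, and offset value here are made up purely for illustration):

from kafka import KafkaConsumer, OffsetAndMetadata, TopicPartition

# hypothetical broker / topic / group names
consumer = KafkaConsumer(bootstrap_servers='localhost:9092',
                         group_id='replay-group',
                         enable_auto_commit=False)
tp = TopicPartition('events', 0)
consumer.assign([tp])

# Both commits are valid: the broker stores the metadata string verbatim next to the
# committed offset and never interprets it, so None or any application string will do.
consumer.commit({tp: OffsetAndMetadata(42, None)})
consumer.commit({tp: OffsetAndMetadata(42, 'replayed-by=replay-job')})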
Example #19
    def _method(topic):
        consumer = KafkaConsumer(
            bootstrap_servers=app.config['KAFKA_URL'],
            group_id='testing',
            key_deserializer=bytes.decode,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            auto_offset_reset='latest',
            enable_auto_commit=False)

        partition = TopicPartition(topic, 0)
        consumer.assign([partition])
        last_pos = consumer.end_offsets([partition])
        pos = last_pos[partition]
        offset = OffsetAndMetadata(pos - 1, '')  # metadata is a plain string (or None), not bytes
        consumer.commit(offsets={partition: offset})
        msg = next(consumer)
        consumer.close()
        return msg
Example #20
    def handler():
        consumer = None
        topics = [topic] if isinstance(topic, str) else topic.copy()
        try:
            consumer = KafkaConsumer(*topics, **conf)
            for message in consumer:
                func(message.key, message.value, message.topic,
                     message.partition, message.offset)
                # manually control the Kafka topic offsets for the consumer group
                if 'enable_auto_commit' in conf and not conf[
                        'enable_auto_commit']:
                    consumer.commit_async({
                        TopicPartition(message.topic, message.partition):
                        OffsetAndMetadata(message.offset, None)
                    })
        finally:
            if consumer:
                consumer.close()
Example #21
    def seek_offset(self, topic_partition: TopicPartition, offset: int) -> None:
        """Seek the provided partition for a configured consumer group to a specific offset.

        Arguments:
            topic_partition (kafka.structs.TopicPartition): Non-localized topic partition.
            offset (int): Desired offset.

        """
        local_tp = self._get_localized_tp(topic_partition)
        if self._dry_run:
            logger.debug(
                "dry_run mode: Attempted to commit on %s:%s to offset %s.",
                local_tp.topic,
                local_tp.partition,
                offset,
            )
        else:
            self._consumer.assign([local_tp])
            self._consumer.commit({local_tp: OffsetAndMetadata(offset, None)})
    def consume(self) -> bool:
        data = list()
        messages = self._consumer.poll(100, 10000)
        if messages:
            # TODO: Only works if there is a single partition per consumer. As soon as the number of consumers is
            # TODO: lower or higher than the number of partitions this fails (a multi-partition variant is sketched
            # TODO: after this example).
            for message in messages[self._consumer.assignment().pop()]:
                key = message.key.decode('utf-8')
                try:
                    value = json.loads(message.value.decode('utf-8'))
                except JSONDecodeError as ex:
                    self._error_logger.error(
                        "Failed to JSONDecode message: {}.".format(
                            message.value.decode('utf-8')))
                    value = {
                        'message': message.value.decode('utf-8'),
                        'error': '{}'.format(ex)
                    }
                if self._key not in value:
                    value['_key'] = key
                data.append(value)
        now = time.time()
        if len(data) > 0:
            result = self._index.bulk(data,
                                      self._key,
                                      op_type=self.configuration.op_type,
                                      upsert=self.configuration.upsert)
            then = time.time()
            amount = then - now
            self._time_logger.info(
                "Success! Indexed {} messages to {} in {} seconds.".format(
                    len(data), self._index.index, amount))
        else:
            result = False

        if result:
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit(
                        {assignment: OffsetAndMetadata(pos, "")})

        return result
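The TODO in the example above points out that `self._consumer.assignment().pop()` assumes a single assigned partition. A hedged sketch of a multi-partition variant, iterating over the poll() result directly (the function name collect_records is hypothetical, the consumer is passed in rather than read from self, and the JSON error handling and key fallback from the original are omitted for brevity):

import json


def collect_records(consumer):
    """Drain one poll() call across every assigned partition of a KafkaConsumer."""
    data = []
    # poll() returns a {TopicPartition: [ConsumerRecord, ...]} mapping covering every
    # partition that returned data, so no single-partition assumption is needed.
    messages = consumer.poll(timeout_ms=100, max_records=10000)
    for tp, records in messages.items():
        for record in records:
            value = json.loads(record.value.decode('utf-8'))
            if record.key is not None:
                value.setdefault('_key', record.key.decode('utf-8'))
            data.append(value)
    return data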
Example #23
    def commit(self, partition_offset: tuple, async_commit: bool = False):
        """
        Commit the offset to Kafka; the synchronous variant blocks until it succeeds or raises.
        Requires a non-None group id.
        :param partition_offset: (topic, partition, offset)
        :param async_commit: choose async commit
        :return:
        """
        topic = partition_offset[0]
        partition = partition_offset[1]
        _offset = partition_offset[2]
        offset = {
            TopicPartition(topic, partition): OffsetAndMetadata(_offset, None)
        }
        if not async_commit:
            self.consumer.commit(offset)
        else:
            self.consumer.commit_async(offset).add_errback(self.commit_err,
                                                           topic=topic,
                                                           partition=partition,
                                                           offset=_offset)
Example #24
    def _update_tasks_status(self):
        offsets = {}
        topics_to_resume = []

        for topic, task in self._runningTasks.items():
            if task.get_offset() > 0:
                for partition in self._internal_consumer.partitions_for_topic(
                        topic):
                    topic_partition = TopicPartition(topic, partition)
                    # metadata must be an opaque string or None, not a TopicPartition
                    offsets[topic_partition] = OffsetAndMetadata(
                        task.get_offset(), None)
            if not task.is_running():
                for partition in self._internal_consumer.partitions_for_topic(
                        topic):
                    topic_partition = TopicPartition(topic, partition)
                    topics_to_resume.append(topic_partition)

        self._internal_consumer.commit_async(offsets)
        for topic_partition in topics_to_resume:
            self._runningTasks.pop(topic_partition.topic)
            self._internal_consumer.resume(topic_partition)
Example #25
    def consumer(self, topic, msg_handler, save_handler):
        bootstrap_servers = '%s:%s' % (self.config.kafka_host,
                                       self.config.kafka_port)
        t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        INFO(
            f"{t} start run consumer, topic: {topic}, group_id: {self.config.group_id}, bootstrap_servers: {bootstrap_servers}"
        )
        consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers,
                                 group_id=self.config.group_id,
                                 auto_offset_reset='earliest',
                                 enable_auto_commit=False)
        tp = TopicPartition(topic=topic, partition=0)
        consumer.assign([tp])

        now_offset = 0
        last_offset = 0
        datas = []

        for msg in consumer:
            now_offset = msg.offset
            data = msg_handler(msg.value)
            if data:
                datas.append(data)
            if len(datas) >= self.save_cnt:
                try:
                    success = save_handler(datas)
                    INFO(f"save_handler result is:{success}")
                    if success:
                        datas = []
                        consumer.commit(offsets={
                            tp: OffsetAndMetadata(now_offset + 1, None)
                        })
                        last_offset = msg.offset
                    else:
                        ERROR(
                            f"save data failed, topic: {topic}, old_offset: {last_offset}, now_offset: {now_offset}"
                        )
                except Exception as e:
                    ERROR(f"commit consumer offset failed,Exception:{e}")
    def commit(self, partition, offset):
        """Commit the given offset for the given partition - indicates that the
        messages until this offset have been successfully replayed to the target postgres
        instance

        Arguments:
            partition (int): kafka topic partition
            offset (int): offset until which processing has been successful
        """
        if offset is None:
            return
        try:
            self.consumer.commit({
                TopicPartition(self.topic, partition):
                OffsetAndMetadata(offset + 1, None)
            })
        except Exception as e:
            self.logger.error(
                'Cannot commit offset {} for topic:partition {}:{}. Error: {}'.
                format(offset, self.topic, partition, e))
            self.metrics.measure(
                KafkaErrorsMeasurement(self.topic, 'CommitError'))
            raise e
def hgweb():
    '''hgweb component of the vcsreplicator bootstrap procedure. Takes a
    vcsreplicator config path and the path of a JSON document (the output
    of the hgssh bootstrap procedure) as CLI arguments.'''
    import argparse

    # Parse CLI args
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='Path of config file to load')
    parser.add_argument(
        'input',
        help=
        'JSON data input (output from the hgssh bootstrap procedure) file path'
    )
    parser.add_argument(
        '--workers',
        help='Number of concurrent workers to use for performing clones',
        type=int,
        default=multiprocessing.cpu_count())
    args = parser.parse_args()

    logger.info('reading hgssh JSON document')
    with open(args.input, 'r') as f:
        hgssh_data = json.loads(f.read())
        logger.info('JSON document read')

    # Convert the JSON keys to integers
    hgssh_data['offsets'] = {
        int(k): v
        for k, v in hgssh_data['offsets'].items()
    }

    config = Config(filename=args.config)

    consumer_config = {
        # set this so offsets are committed to Zookeeper
        'api_version': (0, 8, 1),
        'bootstrap_servers': config.c.get('consumer', 'hosts'),
        'client_id': config.c.get('consumer', 'client_id'),
        'enable_auto_commit': False,
        'group_id': config.c.get('consumer', 'group'),
        'max_partition_fetch_bytes': MAX_BUFFER_SIZE,
        'value_deserializer': value_deserializer,
    }

    topic = config.c.get('consumer', 'topic')

    topicpartitions = [
        TopicPartition(topic, partition)
        for partition, (start_offset,
                        end_offset) in sorted(hgssh_data['offsets'].items())
        # there is no need to do an assignment if the length of the
        # bootstrap message range is 0
        if start_offset != end_offset
    ]

    consumer = KafkaConsumer(**consumer_config)

    outputdata = collections.defaultdict(list)

    # We will remove repos from this set as we replicate them
    # Once this is an empty set we are done
    repositories_to_clone = set()
    for repo in hgssh_data['repositories']:
        filterresult = config.filter(repo)

        if filterresult.passes_filter:
            repositories_to_clone.add(repo)
        else:
            outputdata[repo].append('filtered by rule %s' % filterresult.rule)

    extra_messages = collections.defaultdict(
        collections.deque)  # maps repo names to extra processing messages
    clone_futures_repo_mapping = {}  # maps cloning futures to repo names
    extra_messages_futures_repo_mapping = {
    }  # maps extra messages futures to repo names

    # Overwrite default hglib path so handle_message_main and its derivatives
    # use the correct virtualenv
    hglib.HGPATH = config.c.get('programs', 'hg')

    # Maps partitions to the list of messages within the bootstrap range
    aggregate_messages_by_topicpartition = {
        tp.partition: []
        for tp in topicpartitions
    }

    # Gather all the Kafka messages within the bootstrap range for each partition
    for topicpartition in topicpartitions:
        start_offset, end_offset = hgssh_data['offsets'][
            topicpartition.partition]

        end_offset -= 1

        # Assign the consumer to the next partition and move to the start offset
        logger.info('assigning the consumer to partition %s' %
                    topicpartition.partition)
        consumer.assign([topicpartition])

        logger.info('seeking the consumer to offset %s' % start_offset)
        consumer.seek(topicpartition, start_offset)
        consumer.commit(
            offsets={topicpartition: OffsetAndMetadata(start_offset, '')})

        logger.info(
            'partition %s of topic %s moved to offset %s' %
            (topicpartition.partition, topicpartition.topic, start_offset))

        # Get all the messages we need to process from kafka
        for message in consumer:
            # Check if the message we are processing is within the range of accepted messages
            # If we are in the range, add this message to the list of messages on this partition
            # If we are at the end of the range, break from the loop and move on to the next partition
            if message.offset <= end_offset:
                aggregate_messages_by_topicpartition[message.partition].append(
                    message)
                logger.info(
                    'message on partition %s, offset %s has been collected' %
                    (message.partition, message.offset))

            consumer.commit(
                offsets={
                    TopicPartition(topic, message.partition):
                    OffsetAndMetadata(message.offset + 1, ''),
                })

            if message.offset >= end_offset:
                logger.info('finished retrieving messages on partition %s' %
                            message.partition)
                break

    logger.info('finished retrieving messages from Kafka')

    # Process the previously collected messages
    with futures.ThreadPoolExecutor(args.workers) as e:
        for partition, messages in sorted(
                aggregate_messages_by_topicpartition.items()):
            logger.info('processing messages for partition %s' % partition)
            for message in messages:
                payload = message.value

                # Ignore heartbeat messages
                if payload['name'] == 'heartbeat-1':
                    continue

                if payload['path'] in repositories_to_clone:
                    # If we have not yet replicated the repository for this message,
                    # or the repo sync message is not tagged with the bootstrap flag,
                    # move on to the next message. The assumed upcoming hg-repo-sync-2
                    # message will clone the data represented in this message anyway.
                    if payload['name'] != 'hg-repo-sync-2' or not payload[
                            'bootstrap']:
                        continue

                    logger.info('scheduled clone for %s' % payload['path'])

                    # Schedule the repo sync
                    clone_future = e.submit(clone_repo, config,
                                            payload['path'],
                                            payload['requirements'],
                                            payload['hgrc'], payload['heads'])

                    # Here we register the future against its repo name
                    clone_futures_repo_mapping[clone_future] = payload['path']

                    # Remove the repo from the set of repos
                    # which have not been scheduled to sync
                    repositories_to_clone.remove(payload['path'])
                elif payload['path'] not in outputdata:
                    # If the repo is not in the list of repositories to clone,
                    # and the repo is not in the outputdata object (ie hasn't
                    # errored out, by being filtered or otherwise),
                    # then we have already scheduled the repo sync and we will
                    # need to process this message once the sync completes.
                    extra_messages[payload['path']].append((config, payload))
                    logger.info('extra messages found for %s: %s total' %
                                (payload['path'],
                                 len(extra_messages[payload['path']])))

        if repositories_to_clone:
            logger.error('did not receive expected sync messages for %s' %
                         repositories_to_clone)

            # Add errors to audit output
            for repo in repositories_to_clone:
                outputdata[repo].append('did not receive sync message')

        # Process clones
        remaining_clones = len(clone_futures_repo_mapping)
        for completed_future in futures.as_completed(
                clone_futures_repo_mapping):
            repo = clone_futures_repo_mapping[completed_future]

            exc = completed_future.exception()
            if exc:
                message = 'error triggering replication of Mercurial repo %s: %s' % (
                    repo, str(exc))
                logger.error(message)

                # Add error to audit output
                outputdata[repo].append(message)
            else:
                logger.info('%s successfully cloned' % repo)

            remaining_clones -= 1

            logger.info('%s repositories remaining' % remaining_clones)

            # Schedule extra message processing if necessary
            if repo in extra_messages:
                logger.info('scheduling extra processing for %s' % repo)
                configs, payloads = zip(*extra_messages[repo])
                future = e.submit(map, handle_message_main, configs, payloads)
                extra_messages_futures_repo_mapping[future] = repo

        # Process extra messages
        total_message_batches = len(extra_messages_futures_repo_mapping)
        for completed_future in futures.as_completed(
                extra_messages_futures_repo_mapping):
            repo = extra_messages_futures_repo_mapping[completed_future]

            exc = completed_future.exception()
            if exc:
                message = 'error processing extra messages for %s: %s' % (
                    repo, str(exc))
                logger.error(message)

                # Add error to audit output
                outputdata[repo].append(message)
            else:
                logger.info('extra processing for %s completed successfully' %
                            repo)

            total_message_batches -= 1
            logger.info('%s batches remaining' % total_message_batches)

    logger.info('%s bootstrap process complete' %
                config.c.get('consumer', 'group'))

    # If anything broke, dump the errors and set exit code 1
    if outputdata:
        with open('/repo/hg/hgweb_bootstrap_out.json', 'w') as f:
            f.write(json.dumps(outputdata))
        return 1
def Consumer(thread_name, topic, partition):
    print(
        thread_name,
        "Starting\tDispose",
    )
    global is_dispose
    broker_list = '172.16.90.63:6667, 172.16.90.58:6667, 172.16.90.59:6667'
    '''
    fetch_min_bytes (int) - minimum amount of data the server should return for a fetch request; otherwise it waits
    fetch_max_wait_ms (int) - maximum time (in milliseconds) the server will block before answering a fetch request
                              if there is not enough data to satisfy fetch_min_bytes
    fetch_max_bytes (int) - maximum amount of data the server should return for a fetch request. This is not an
                            absolute maximum: if the first message in the first non-empty partition of the fetch is
                            larger than this value, the message is still returned so the consumer can make progress.
                            Note: the consumer fetches from multiple brokers in parallel, so memory usage depends on
                            the number of brokers hosting partitions of the topic.
                            Supported Kafka versions >= 0.10.1.0. Default: 52428800 (50 MB).
    enable_auto_commit (bool) - if True, the consumer's offsets are committed periodically in the background. Default: True.
    max_poll_records (int) - maximum number of records returned in a single call to poll(). Default: 500
    max_poll_interval_ms (int) - maximum delay between calls to poll() when using consumer group management. This
                                 places an upper bound on how long the consumer can be idle before fetching more
                                 records. If poll() is not called before this timeout expires, the consumer is
                                 considered failed and the group rebalances to reassign the partitions to another
                                 member. Default: 300000
    '''
    consumer = KafkaConsumer(
        bootstrap_servers=broker_list,
        group_id="xiaofei",
        client_id=thread_name,
        # auto_offset_reset="smallest",
        enable_auto_commit=False,
        fetch_min_bytes=1024 * 1024,
        # fetch_max_bytes=1024 * 1024 * 1024 * 10,
        fetch_max_wait_ms=60000,
        request_timeout_ms=305000,
        # consumer_timeout_ms=1,
        # max_poll_records=5000,
        # max_poll_interval_ms=60000  (parameter not available in this kafka-python version)
    )
    dic = get_kafka(topic, partition)

    tp = TopicPartition(topic, partition)
    # print(thread_name, tp, dic['offset'])
    consumer.assign([tp])
    # seek the partition to the stored offset
    consumer.seek(tp, dic['offset'])
    print("first run\tthread:", thread_name, "partition:", partition, "offset:", dic['offset'],
          "\tstart consuming...")
    num = 0
    # end_offset = consumer.end_offsets([tp])[tp]
    # print(end_offset)
    while True:
        args = OrderedDict()
        checkThread()
        msg = consumer.poll(timeout_ms=60000)
        end_offset = consumer.end_offsets([tp])[tp]
        print('committed offset:', consumer.committed(tp), 'latest offset:', end_offset)
        # simulate a dying thread (for testing)
        # if thread_name=="Thread-1" and num==2:
        #     sys.exit()
        if len(thread_msg) > 0 and is_dispose is True:
            is_dispose = False
            for msg_send in thread_msg:
                exp(msg_send)
                send_msg(msg_send)
            thread_msg.clear()
        if len(msg) > 0:
            print("线程:", thread_name, "分区:", partition, "最大偏移量:", end_offset,
                  "有无数据,", len(msg))
            lines = 0
            for data in msg.values():
                for line in data:
                    lines += 1
                    line = eval(line.value.decode('utf-8'))
                    value, log_name = get_line(col_dic, line)
                    sql = sql_dic[log_name]
                    if value is not None:
                        args.setdefault(sql, []).append(tuple(value))
            print(thread_name, "processed records:", lines)
            # save the data to the database
            is_succeed = save_to_db(args, thread_name)
            if is_succeed:
                # update the partition offset stored in the database
                is_succeed1 = update_offset(topic, partition, end_offset)
                # manually commit the offset to Kafka
                consumer.commit(
                    offsets={tp: (OffsetAndMetadata(end_offset, None))})
                # print(thread_name,"to db suss",num+1)
                if is_succeed1 == 0:
                    sys.exit()
            else:
                sys.exit()
        else:
            pass
            # print(thread_name, 'no data')
        # time.sleep(60)
        num += 1
Example #29
def test_commit(consumer):
    partition = TopicPartition('test', 2)
    offset_metadata = OffsetAndMetadata(2, 'xx')
    response = consumer.commit({partition: offset_metadata})
    print(response)
Example #30
args = {
    # "security_protocol": "SSL",
    # "ssl_cafile": "/Users/yixiang/Projects/ISI/mydig-projects/dig3_ht/kafka_ssl/ca-cert.pem",
    # "ssl_certfile": "/Users/yixiang/Projects/ISI/mydig-projects/dig3_ht/kafka_ssl/client-cert.pem",
    # "ssl_keyfile": "/Users/yixiang/Projects/ISI/mydig-projects/dig3_ht/kafka_ssl/client-key.pem",
    # "ssl_check_hostname": False
}


part_num = 1
group_id = 'dig_test'
topic_id = 'test'

consumer = KafkaConsumer(
    bootstrap_servers=broker_list,
    group_id=group_id,
    **args
)

meta = consumer.partitions_for_topic(topic_id)
assigned_parts = []
for i in range(part_num):
    assigned_parts.append(TopicPartition(topic_id, i))
consumer.assign(assigned_parts)
for p in assigned_parts:
    consumer.seek(p, 0)
    # sometimes it is blocked, need to restart
    # metadata should be an opaque string or None, not the partition set from partitions_for_topic()
    consumer.commit({p: OffsetAndMetadata(0, None)})

print('done')