def get_some_message(self, num, start_offset=None):
    # start_offset: if None, start from the group's committed offset on the Kafka server
    # message_num: number of messages to fetch
    # groupid: consumer group
    # client_id: consumer client id
    # topic: topic to consume
    # partition: partition to consume
    # bootstrap_servers: Kafka server address (localhost:9092)
    if start_offset is not None:
        self.consumer.seek(self.topic_partition, start_offset)
        self.consumer.commit(
            offsets={self.topic_partition: OffsetAndMetadata(start_offset, None)})
    res = self.consumer.poll(timeout_ms=5000, max_records=num)
    if res:
        for i in res.values():
            for j in i:
                print(j)
                self.message.append(j.value)
            a = len(i)
        old_offset = self.consumer.committed(self.topic_partition)
        print(old_offset)
        new_offset = a + old_offset
        self.consumer.commit(
            offsets={self.topic_partition: OffsetAndMetadata(new_offset, None)})
    else:
        a = 0
    return {'message_num': a, 'message_list': self.message}
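The method above leans on state created elsewhere in the class (`self.consumer`, `self.topic_partition`, `self.message`). A minimal sketch of the constructor it seems to assume, using only kafka-python calls; the class name `MessagePuller` and the `localhost:9092` default are placeholders taken from the parameter comments, not from the original source:

from kafka import KafkaConsumer, TopicPartition


class MessagePuller:
    """Hypothetical wrapper assumed by get_some_message above."""

    def __init__(self, topic, partition, group_id, client_id,
                 bootstrap_servers='localhost:9092'):
        self.topic = topic
        self.message = []  # accumulates consumed message values
        self.consumer = KafkaConsumer(
            group_id=group_id,
            client_id=client_id,
            bootstrap_servers=bootstrap_servers,
            enable_auto_commit=False)  # offsets are committed manually
        # Manual assignment so seek()/commit() can target a single partition
        self.topic_partition = TopicPartition(topic, partition)
        self.consumer.assign([self.topic_partition])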
def test_convert_partition_offsets_translates_partition_offsets_to_committable_topic_offsets():
    offsets = convert_partition_offsets('foo', {0: 100, 1: 200})

    assert offsets == {
        TopicPartition(topic='foo', partition=0): OffsetAndMetadata(offset=100, metadata=''),
        TopicPartition(topic='foo', partition=1): OffsetAndMetadata(offset=200, metadata='')
    }
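The helper under test is not shown in this example. A sketch of an implementation that would satisfy the assertion, assuming it only has to map `{partition: offset}` pairs onto committable kafka-python structs:

from kafka import TopicPartition
from kafka.structs import OffsetAndMetadata


def convert_partition_offsets(topic, partition_offsets):
    """Build a dict suitable for KafkaConsumer.commit() from {partition: offset}."""
    return {
        TopicPartition(topic=topic, partition=partition):
            OffsetAndMetadata(offset=offset, metadata='')
        for partition, offset in partition_offsets.items()
    }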
def consume(self) -> bool:
    """
    Consumes a single message from the subscribed topic and indexes it into
    the elasticsearch index.

    Returns True if successful, False otherwise.
    """
    message = next(self._consumer)
    key = message.key.decode('utf-8')
    try:
        value = json.loads(message.value.decode('utf-8'))
    except JSONDecodeError as ex:
        value = {
            'message': message.value.decode('utf-8'),
            'error': '{}'.format(ex)
        }
    result = self._index.index_into(value, key)
    if result:
        for assignment in self._consumer.assignment():
            pos = self._consumer.position(assignment)
            if pos != self._consumer.committed(assignment):
                self._consumer.commit(
                    {assignment: OffsetAndMetadata(pos, "")})
    # self._time_logger.info("Consumed and indexed one message.")
    return result
def consume_one_message_at_a_time(conf):
    kafka_brokers = conf.KAFKA_BROKERS
    topic = conf.demo_topic
    group_id = f'{conf.demo_group_id}_2'
    print(
        f'KAFKA_BROKERS: {kafka_brokers}\n Topic {topic}\n group id: {group_id}'
    )

    consumer = KafkaConsumer(topic,
                             bootstrap_servers=kafka_brokers,
                             group_id=group_id,
                             enable_auto_commit=False,
                             max_poll_records=1)

    print(f'bootstrap_servers: {kafka_brokers} subscribing to {topic}')
    consumer.subscribe([topic])

    for message in consumer:
        print(f"message is of type: {type(message)}")
        print(message)

        do_something_time_consuming()

        # partitions_for_topic returns the set of partition ids for the topic;
        # here it is passed through as the (opaque) metadata of the commit
        meta = consumer.partitions_for_topic(message.topic)

        partition = TopicPartition(message.topic, message.partition)
        offsets = OffsetAndMetadata(message.offset + 1, meta)
        options = {partition: offsets}
        print(f'\noptions: {options}\n')

        response = consumer.commit(offsets=options)
def start(self):
    for msg in self.consumer:
        logging.info(msg)
        value = msg.value
        topic_partition = TopicPartition(self.kafka_consumer_topic, msg.partition)
        offset = OffsetAndMetadata(msg.offset, '')
        message_id = value['message_id']
        message_payload = value['payload']
        try:
            start_execution_datetime = str(datetime.utcnow())
            time_before_face_location = current_milli_time()
            process_image_result = self.message_processing.process_message(message_payload)
            time_after_face_location = current_milli_time()
            if process_image_result is not None:
                response_message = {
                    'payload': process_image_result,
                    'message_id': str(uuid.uuid4()),
                    'consumer_id': self.consumer_id,
                    'host': get_hostname(),
                    'execution_time_ms': time_after_face_location - time_before_face_location,
                    'start_execution_datetime': start_execution_datetime
                }
                self.consumer.commit(offsets={topic_partition: offset})
                future_send = self.producer.send(self.kafka_response_topic, response_message)
                future_send.get(2 * 60)
                logging.info('Save ')
            else:
                logging.error("Can't process message '%s' on consumer '%s', skip it",
                              message_id, self.consumer_id)
        except Exception as e:
            logging.error('Exception %s', e)
def _run(self):
    consumer = KafkaConsumer(self.kafka_face_recognise_result_topic,
                             bootstrap_servers=[self.kafka_host],
                             auto_offset_reset='earliest',
                             enable_auto_commit=False,
                             group_id='recognised_faces',
                             value_deserializer=kafka_json_deserializer,
                             max_poll_records=20)
    db_image_service = DbImageService(self.db_uri, self.db_image_name)
    for msg in consumer:
        message_v = msg.value
        message_payload = message_v['payload']
        crop_face_image_id = message_payload['db_id']
        crop_face_image_document = db_image_service.find_crop_face_by_id(
            crop_face_image_id)
        if crop_face_image_document:
            recognised_faces = message_payload['recognised_faces']
            db_image_service.update_crop_face_recognise(
                crop_face_image_id, recognised_faces)
        else:
            logging.warning(
                "Can't find crop face image by %s",
                str({'_id': ObjectId(message_payload['db_id'])}))

        topic_partition = TopicPartition(
            self.kafka_face_recognise_result_topic, msg.partition)
        offset = OffsetAndMetadata(msg.offset, '')
        consumer.commit(offsets={topic_partition: offset})
        logging.info("after commit %s", message_v['consumer_id'])
def main():
    consumer = KafkaConsumer(bootstrap_servers=["worker2.hengan.shop:9092"],
                             group_id='me',
                             auto_offset_reset='earliest',
                             # value_deserializer=lambda m: json.loads(m.decode('utf-8')),
                             enable_auto_commit=False)
    # consumer.assign([TopicPartition('foobar2', 0)])
    consumer.subscribe(['foobar2'])
    # consumer.seek(TopicPartition('foobar2', 0), 100)
    print(consumer.topics())
    print(consumer.subscription())
    ret = consumer.poll()
    print(ret)
    print(consumer.assignment())
    # consumer.seek_to_beginning()
    try:
        for message in consumer:
            print("%s:%d:%d: key=%s value=%s" %
                  (message.topic, message.partition, message.offset,
                   message.key, message.value))
            print(consumer.partitions_for_topic('foobar2'))
            print("offset is %d" % message.offset)
            tp1 = TopicPartition(topic="foobar2", partition=0)
            om = OffsetAndMetadata(offset=message.offset + 1, metadata=1)
            consumer.commit({tp1: om})
            break
    except KeyboardInterrupt:
        sys.exit()
def success(self, model):
    from kafka import TopicPartition, OffsetAndMetadata

    kafka_msg = model.raw_message
    self._consumer.commit({
        TopicPartition(kafka_msg.topic, kafka_msg.partition):
        OffsetAndMetadata(kafka_msg.offset + 1, "")
    })
def reset_offset(self, reset_offset_value):
    partitions_offset = {}
    for partition_id in self.consumer.partitions_for_topic(self.topic):
        partitions_offset[TopicPartition(
            self.topic, partition_id)] = OffsetAndMetadata(reset_offset_value, '')
    self.consumer.commit(partitions_offset)
def _run(self):
    consumer = KafkaConsumer(self.kafka_image_recognised_result_topic,
                             bootstrap_servers=[self.kafka_host],
                             auto_offset_reset='earliest',
                             enable_auto_commit=False,
                             group_id='recognised_faces',
                             value_deserializer=kafka_json_deserializer,
                             max_poll_records=20)
    db_image_service = DbImageService(self.db_uri, self.db_image_name)
    for msg in consumer:
        message_v = msg.value
        message_payload = message_v['payload']
        recognised_objects = message_payload['recognised_objects']
        logging.info('Receive message - consumer-id: %s', message_v['consumer_id'])
        image_document = db_image_service.find_image_by_id_path(
            message_payload['db_id'], message_payload['original_path'])
        if image_document:
            db_image_service.update_image_faces_process_step(
                message_payload['db_id'], message_payload['original_path'],
                recognised_objects, message_payload['timestamp'])

            cropped_faces = []
            for r in recognised_objects:
                for c in r['cropped_faces']:
                    cropped_faces.append({
                        'image_id': image_document['_id'],
                        'crop_face_image_path': c['face_image_path'],
                        'crop_face_id': c['crop_face_id'],
                        'face_location': c['face_location'],
                        'face_recognised': False,
                        'face_recognised_in_queue': False,
                        'recognised': None
                    })

            for c in cropped_faces:
                db_image_service.insert_crop_face_image(c)
        else:
            logging.warning(
                "Can't find image by %s",
                str({
                    '_id': ObjectId(message_payload['db_id']),
                    'path': message_payload['original_path']
                }))

        topic_partition = TopicPartition(
            self.kafka_image_recognised_result_topic, msg.partition)
        offset = OffsetAndMetadata(msg.offset, '')
        consumer.commit(offsets={topic_partition: offset})
        logging.info("after commit %s", message_v['consumer_id'])
def sync_stream(kafka_config, stream, state):
    consumer = KafkaConsumer(
        kafka_config['topic'],
        group_id=kafka_config['group_id'],
        enable_auto_commit=False,
        consumer_timeout_ms=kafka_config.get('consumer_timeout_ms', 10000),
        auto_offset_reset='earliest',
        value_deserializer=lambda m: json.loads(m.decode('ascii')),
        bootstrap_servers=kafka_config['bootstrap_servers'])

    send_schema_message(stream)
    stream_version = singer.get_bookmark(state, stream['tap_stream_id'], 'version')
    if stream_version is None:
        stream_version = int(time.time() * 1000)
    state = singer.write_bookmark(state, stream['tap_stream_id'], 'version',
                                  stream_version)
    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
    activate_version_message = singer.ActivateVersionMessage(
        stream=stream['tap_stream_id'], version=stream_version)

    singer.write_message(activate_version_message)

    time_extracted = utils.now()
    rows_saved = 0
    for message in consumer:
        LOGGER.info("%s:%s:%s: key=%s value=%s" %
                    (message.topic, message.partition, message.offset,
                     message.key, message.value))
        # stream['schema']
        record = singer.RecordMessage(stream=stream['tap_stream_id'],
                                      record=message.value,
                                      time_extracted=time_extracted)
        [valid, error] = validate_record(stream['schema'], record)
        rows_saved = rows_saved + 1

        if valid:
            singer.write_message(record)
        elif kafka_config.get('reject_topic'):
            send_reject_message(kafka_config, record, error)
        else:
            raise Exception(
                "record failed validation and no reject_topic was specified")

        state = singer.write_bookmark(state, stream['tap_stream_id'], 'offset',
                                      message.offset)

        # commit offsets because we processed the message
        tp = TopicPartition(message.topic, message.partition)
        consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})

        if rows_saved % UPDATE_BOOKMARK_PERIOD == 0:
            singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))

    singer.write_message(singer.StateMessage(value=copy.deepcopy(state)))
def get_cons_offsets(topic, cons):
    """ Get current committed consumer group offsets for topic.
    """
    partitions_for_topic = cons.partitions_for_topic(topic)
    partitions = [TopicPartition(topic, i) for i in partitions_for_topic]
    committed_offsets = {
        partition: OffsetAndMetadata(cons.committed(partition), None)
        for partition in partitions
    }
    return committed_offsets
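A possible way to use the helper above, with placeholder topic and consumer settings. Note that `KafkaConsumer.committed()` returns `None` for partitions with no committed offset yet, so the returned dict may need filtering before it is handed back to `commit()`:

from kafka import KafkaConsumer

cons = KafkaConsumer(bootstrap_servers='localhost:9092',  # placeholder broker
                     group_id='example-group',            # placeholder group
                     enable_auto_commit=False)
offsets = get_cons_offsets('example-topic', cons)         # placeholder topic

# Skip partitions that have never been committed (committed() returned None)
committable = {tp: om for tp, om in offsets.items() if om.offset is not None}
if committable:
    cons.commit(committable)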
def teardown_kafka_reader(q, positions, _setup=None):
    q.cancel_join_thread()
    cons = _setup['consumer']
    offsets = dict([(tp, OffsetAndMetadata(offset=positions[tp.partition], metadata=''))
                    for tp in cons.assignment()])
    cons.commit(offsets)
    cons.close(autocommit=False)
    logger.info("exit kafka reader process")
    raise OpExit("exit kafka reader process")
def reset_offset(self, reset_offset_value):
    # Trigger Rebalance
    self.consumer.subscribe(self.topic)
    self.consumer.poll(0)

    # Reset Offset
    partitions_offset = {}
    for partition_id in self.consumer.partitions_for_topic(self.topic):
        partitions_offset[TopicPartition(
            self.topic, partition_id)] = OffsetAndMetadata(reset_offset_value, '')
    self.consumer.commit(partitions_offset)
def success_result(self, msg, consumer, result):
    """
    :param msg: ConsumerRecord
    :param consumer: KafkaConsumer
    :param result: Unwrapped success result
    :return:
    """
    if not self.enable_auto_commit:
        options = {
            TopicPartition(msg.topic, msg.partition):
            OffsetAndMetadata(msg.offset + 1, None)
        }
        consumer.commit(options)
def _commit_offsets(self, topic_partition: TopicPartition, offset: int):
    """Commits offsets for the partition of a given topic.

    This effectively advances the index so that future reads from the same
    Kafka consumer group will not read any records up to that offset.

    :param topic_partition: Partition of the topic where offsets are to be committed.
    :param offset: Largest offset read so far.
    :return:
    """
    self._consumer.commit({
        topic_partition: OffsetAndMetadata(offset=offset + 1, metadata=''),
    })
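A usage sketch of the semantics described in the docstring; the names are placeholders. Committing `offset + 1` means the next read by this consumer group starts just after the last record that was processed:

from kafka import TopicPartition

tp = TopicPartition('example-topic', 0)       # placeholder topic/partition
last_read_offset = 41                         # offset of the last record processed
reader._commit_offsets(tp, last_read_offset)  # `reader` is an instance of the class above
# equivalent to: self._consumer.commit({tp: OffsetAndMetadata(offset=42, metadata='')})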
def _commit_offsets(self, partition):
    """Commit kafka consumer group offsets.
    """
    commit_offsets = {}
    to_delete = []
    for tp, offset in six.iteritems(self._consumer_offsets):
        if tp.partition == partition:
            commit_offsets[tp] = OffsetAndMetadata(offset, None)
            to_delete.append(tp)

    for tp in to_delete:
        del self._consumer_offsets[tp]

    self._consumer.commit(commit_offsets)
def get_failed_events(config, from_topic, to_topic):
    consumer = KafkaConsumer(
        bootstrap_servers=config['kafka']['hosts'],
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        auto_commit_interval_ms=1000,
        group_id=to_topic
    )
    n_partitions = len(consumer.partitions_for_topic(from_topic))

    events_to_publish = list()
    commit_options = dict()
    for partition_idx in range(n_partitions):
        partition = TopicPartition(from_topic, partition_idx)
        consumer.assign([partition])

        # we'll start reading from this position
        from_offset = consumer.position(partition)

        # obtain the last offset value
        consumer.seek_to_end(partition)
        to_offset = consumer.position(partition)

        print(f'partition_idx: {partition_idx}, from_offset: {from_offset}, to_offset: {to_offset}')

        # no new events since last replay
        if from_offset >= to_offset:
            continue

        consumer.seek(partition, from_offset)

        for message in consumer:
            event = str(message.value, 'utf-8')
            events_to_publish.append(event)

            if message.offset >= to_offset - 1:
                """
                from kafka-python team on github (https://github.com/dpkp/kafka-python/issues/645):
                "the metadata is really just an opaque string. You can also pass None.
                Nothing uses metadata internally, it is there as a way for you to store
                application-specific data if needed."
                """
                commit_options[partition] = OffsetAndMetadata(message.offset + 1, None)
                break

    consumer.commit(commit_options)
    return events_to_publish
def _method(topic):
    consumer = KafkaConsumer(
        bootstrap_servers=app.config['KAFKA_URL'],
        group_id='testing',
        key_deserializer=bytes.decode,
        value_deserializer=lambda v: json.loads(v.decode('utf-8')),
        auto_offset_reset='latest',
        enable_auto_commit=False)
    partition = TopicPartition(topic, 0)
    consumer.assign([partition])
    last_pos = consumer.end_offsets([partition])
    pos = last_pos[partition]
    offset = OffsetAndMetadata(pos - 1, b'')
    consumer.commit(offsets={partition: offset})
    msg = next(consumer)
    consumer.close()
    return msg
def handler():
    consumer = None
    topics = [topic] if isinstance(topic, str) else topic.copy()
    try:
        consumer = KafkaConsumer(*topics, **conf)
        for message in consumer:
            func(message.key, message.value, message.topic,
                 message.partition, message.offset)
            # manually control the kafka topic offset
            if 'enable_auto_commit' in conf and not conf['enable_auto_commit']:
                consumer.commit_async({
                    TopicPartition(message.topic, message.partition):
                    OffsetAndMetadata(message.offset, None)
                })
    finally:
        if consumer:
            consumer.close()
def seek_offset(self, topic_partition: TopicPartition, offset: int) -> None:
    """Seek the provided partition for a configured consumer group to a specific offset.

    Arguments:
        topic_partition (kafka.structs.TopicPartition): Non-localized topic partition.
        offset (int): Desired offset.
    """
    local_tp = self._get_localized_tp(topic_partition)

    if self._dry_run:
        logger.debug(
            "dry_run mode: Attempted to commit on %s:%s to offset %s.",
            local_tp.topic,
            local_tp.partition,
            offset,
        )
    else:
        self._consumer.assign([local_tp])
        self._consumer.commit({local_tp: OffsetAndMetadata(offset, None)})
def consume(self) -> bool:
    data = list()
    messages = self._consumer.poll(100, 10000)
    if messages:
        # TODO: Only works if there is a single partition per consumer. As soon as the number of consumers is lower
        # TODO: or higher than the number of partitions this fails.
        for message in messages[self._consumer.assignment().pop()]:
            key = message.key.decode('utf-8')
            try:
                value = json.loads(message.value.decode('utf-8'))
            except JSONDecodeError as ex:
                self._error_logger.error(
                    "Failed to JSONDecode message: {}.".format(
                        message.value.decode('utf-8')))
                value = {
                    'message': message.value.decode('utf-8'),
                    'error': '{}'.format(ex)
                }
            if self._key not in value:
                value['_key'] = key
            data.append(value)

    now = time.time()
    if len(data) > 0:
        result = self._index.bulk(data, self._key,
                                  op_type=self.configuration.op_type,
                                  upsert=self.configuration.upsert)
        then = time.time()
        amount = then - now
        self._time_logger.info(
            "Success! Indexed {} messages to {} in {} seconds.".format(
                len(data), self._index.index, amount))
    else:
        result = False

    if result:
        for assignment in self._consumer.assignment():
            pos = self._consumer.position(assignment)
            if pos != self._consumer.committed(assignment):
                self._consumer.commit(
                    {assignment: OffsetAndMetadata(pos, "")})
    return result
def commit(self, partition_offset: tuple, async_commit: bool = False):
    """
    Commit the offset to kafka, blocking until it succeeds or raises.
    Requires a group id that is not None.
    :param partition_offset: (topic, partition, offset)
    :param async_commit: choose async commit
    :return:
    """
    topic = partition_offset[0]
    partition = partition_offset[1]
    _offset = partition_offset[2]
    offset = {
        TopicPartition(topic, partition): OffsetAndMetadata(_offset, None)
    }
    if not async_commit:
        self.consumer.commit(offset)
    else:
        self.consumer.commit_async(offset).add_errback(self.commit_err,
                                                       topic=topic,
                                                       partition=partition,
                                                       offset=_offset)
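A usage sketch for the method above; `client` stands for an instance of the class that defines `commit()`, and the topic, partition, and offset values are placeholders:

# synchronous commit of offset 42 for partition 0 of a hypothetical topic
client.commit(('demo-topic', 0, 42))

# asynchronous commit; failures are routed to commit_err via the errback
client.commit(('demo-topic', 0, 42), async_commit=True)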
def _update_tasks_status(self):
    offsets = {}
    topics_to_resume = []

    for topic, task in self._runningTasks.items():
        if task.get_offset() > 0:
            for partition in self._internal_consumer.partitions_for_topic(topic):
                topic_partition = TopicPartition(topic, partition)
                offsets[topic_partition] = OffsetAndMetadata(
                    task.get_offset(), topic_partition)
        if not task.is_running():
            for partition in self._internal_consumer.partitions_for_topic(topic):
                topic_partition = TopicPartition(topic, partition)
                topics_to_resume.append(topic_partition)

    self._internal_consumer.commit_async(offsets)

    for topic_partition in topics_to_resume:
        self._runningTasks.pop(topic_partition.topic)
        self._internal_consumer.resume(topic_partition)
def consumer(self, topic, msg_handler, save_handler):
    bootstrap_servers = '%s:%s' % (self.config.kafka_host, self.config.kafka_port)
    t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    INFO(
        f"{t} start run consumer, topic: {topic}, group_id: {self.config.group_id}, bootstrap_servers: {bootstrap_servers}"
    )
    consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers,
                             group_id=self.config.group_id,
                             auto_offset_reset='earliest',
                             enable_auto_commit=False)
    tp = TopicPartition(topic=topic, partition=0)
    consumer.assign([tp])
    now_offset = 0
    last_offset = 0
    datas = []
    for msg in consumer:
        now_offset = msg.offset
        data = msg_handler(msg.value)
        if data:
            datas.append(data)
        if len(datas) >= self.save_cnt:
            try:
                success = save_handler(datas)
                INFO(f"save_handler result is:{success}")
                if success:
                    datas = []
                    consumer.commit(offsets={
                        tp: (OffsetAndMetadata(now_offset + 1, 0))
                    })
                    last_offset = msg.offset
                else:
                    ERROR(
                        f"save data failed, topic: {topic}, old_offset: {last_offset}, now_offset: {now_offset}"
                    )
            except Exception as e:
                ERROR(f"commit consumer offset failed,Exception:{e}")
def commit(self, partition, offset):
    """Commit the given offset for the given partition - indicates that the
    messages until this offset have been successfully replayed to the target
    postgres instance

    Arguments:
        partition (int): kafka topic partition
        offset (int): offset until which processing has been successful
    """
    if offset is None:
        return
    try:
        self.consumer.commit({
            TopicPartition(self.topic, partition):
            OffsetAndMetadata(offset + 1, None)
        })
    except Exception as e:
        self.logger.error(
            'Cannot commit offset {} for topic:partition {}:{}. Error: {}'.format(
                offset, self.topic, partition, e))
        self.metrics.measure(
            KafkaErrorsMeasurement(self.topic, 'CommitError'))
        raise e
def hgweb():
    '''hgweb component of the vcsreplicator bootstrap procedure.

    Takes a vcsreplicator config path on the CLI and takes a JSON data
    structure on stdin'''
    import argparse

    # Parse CLI args
    parser = argparse.ArgumentParser()
    parser.add_argument('config', help='Path of config file to load')
    parser.add_argument(
        'input',
        help='JSON data input (output from the hgssh bootstrap procedure) file path')
    parser.add_argument(
        '--workers',
        help='Number of concurrent workers to use for performing clones',
        type=int,
        default=multiprocessing.cpu_count())

    args = parser.parse_args()

    logger.info('reading hgssh JSON document')
    with open(args.input, 'r') as f:
        hgssh_data = json.loads(f.read())
    logger.info('JSON document read')

    # Convert the JSON keys to integers
    hgssh_data['offsets'] = {
        int(k): v
        for k, v in hgssh_data['offsets'].items()
    }

    config = Config(filename=args.config)

    consumer_config = {
        # set this so offsets are committed to Zookeeper
        'api_version': (0, 8, 1),
        'bootstrap_servers': config.c.get('consumer', 'hosts'),
        'client_id': config.c.get('consumer', 'client_id'),
        'enable_auto_commit': False,
        'group_id': config.c.get('consumer', 'group'),
        'max_partition_fetch_bytes': MAX_BUFFER_SIZE,
        'value_deserializer': value_deserializer,
    }

    topic = config.c.get('consumer', 'topic')

    topicpartitions = [
        TopicPartition(topic, partition)
        for partition, (start_offset, end_offset)
        in sorted(hgssh_data['offsets'].items())
        # there is no need to do an assignment if the length of the
        # bootstrap message range is 0
        if start_offset != end_offset
    ]

    consumer = KafkaConsumer(**consumer_config)

    outputdata = collections.defaultdict(list)

    # We will remove repos from this set as we replicate them
    # Once this is an empty set we are done
    repositories_to_clone = set()
    for repo in hgssh_data['repositories']:
        filterresult = config.filter(repo)

        if filterresult.passes_filter:
            repositories_to_clone.add(repo)
        else:
            outputdata[repo].append('filtered by rule %s' % filterresult.rule)

    extra_messages = collections.defaultdict(
        collections.deque)  # maps repo names to extra processing messages
    clone_futures_repo_mapping = {}  # maps cloning futures to repo names
    extra_messages_futures_repo_mapping = {}  # maps extra messages futures to repo names

    # Overwrite default hglib path so handle_message_main and its derivatives
    # use the correct virtualenv
    hglib.HGPATH = config.c.get('programs', 'hg')

    # Maps partitions to the list of messages within the bootstrap range
    aggregate_messages_by_topicpartition = {
        tp.partition: []
        for tp in topicpartitions
    }

    # Gather all the Kafka messages within the bootstrap range for each partition
    for topicpartition in topicpartitions:
        start_offset, end_offset = hgssh_data['offsets'][topicpartition.partition]
        end_offset -= 1

        # Assign the consumer to the next partition and move to the start offset
        logger.info('assigning the consumer to partition %s' % topicpartition.partition)
        consumer.assign([topicpartition])

        logger.info('seeking the consumer to offset %s' % start_offset)
        consumer.seek(topicpartition, start_offset)
        consumer.commit(
            offsets={topicpartition: OffsetAndMetadata(start_offset, '')})

        logger.info(
            'partition %s of topic %s moved to offset %s' %
            (topicpartition.partition, topicpartition.topic, start_offset))

        # Get all the messages we need to process from kafka
        for message in consumer:
            # Check if the message we are processing is within the range of accepted messages
            # If we are in the range, add this message to the list of messages on this partition
            # If we are at the end of the range, break from the loop and move on to the next partition
            if message.offset <= end_offset:
                aggregate_messages_by_topicpartition[message.partition].append(message)
                logger.info(
                    'message on partition %s, offset %s has been collected' %
                    (message.partition, message.offset))

            consumer.commit(
                offsets={
                    TopicPartition(topic, message.partition):
                    OffsetAndMetadata(message.offset + 1, ''),
                })

            if message.offset >= end_offset:
                logger.info('finished retrieving messages on partition %s' % message.partition)
                break

    logger.info('finished retrieving messages from Kafka')

    # Process the previously collected messages
    with futures.ThreadPoolExecutor(args.workers) as e:
        for partition, messages in sorted(aggregate_messages_by_topicpartition.items()):
            logger.info('processing messages for partition %s' % partition)

            for message in messages:
                payload = message.value

                # Ignore heartbeat messages
                if payload['name'] == 'heartbeat-1':
                    continue

                if payload['path'] in repositories_to_clone:
                    # If we have not yet replicated the repository for this message,
                    # or the repo sync message is not tagged with the bootstrap flag,
                    # move on to the next message. The assumed upcoming hg-repo-sync-2
                    # message will clone the data represented in this message anyways.
                    if payload['name'] != 'hg-repo-sync-2' or not payload['bootstrap']:
                        continue

                    logger.info('scheduled clone for %s' % payload['path'])

                    # Schedule the repo sync
                    clone_future = e.submit(clone_repo, config, payload['path'],
                                            payload['requirements'],
                                            payload['hgrc'], payload['heads'])

                    # Here we register the future against its repo name
                    clone_futures_repo_mapping[clone_future] = payload['path']

                    # Remove the repo from the set of repos
                    # which have not been scheduled to sync
                    repositories_to_clone.remove(payload['path'])
                elif payload['path'] not in outputdata:
                    # If the repo is not in the list of repositories to clone,
                    # and the repo is not in the outputdata object (ie hasn't
                    # errored out, by being filtered or otherwise),
                    # then we have already scheduled the repo sync and we will
                    # need to process this message once the sync completes.
                    extra_messages[payload['path']].append((config, payload))
                    logger.info(
                        'extra messages found for %s: %s total' %
                        (payload['path'], len(extra_messages[payload['path']])))

        if repositories_to_clone:
            logger.error('did not receive expected sync messages for %s' % repositories_to_clone)

            # Add errors to audit output
            for repo in repositories_to_clone:
                outputdata[repo].append('did not receive sync message')

        # Process clones
        remaining_clones = len(clone_futures_repo_mapping)
        for completed_future in futures.as_completed(clone_futures_repo_mapping):
            repo = clone_futures_repo_mapping[completed_future]

            exc = completed_future.exception()
            if exc:
                message = 'error triggering replication of Mercurial repo %s: %s' % (repo, str(exc))
                logger.error(message)

                # Add error to audit output
                outputdata[repo].append(message)
            else:
                logger.info('%s successfully cloned' % repo)

            remaining_clones -= 1
            logger.info('%s repositories remaining' % remaining_clones)

            # Schedule extra message processing if necessary
            if repo in extra_messages:
                logger.info('scheduling extra processing for %s' % repo)
                configs, payloads = zip(*extra_messages[repo])
                future = e.submit(map, handle_message_main, configs, payloads)
                extra_messages_futures_repo_mapping[future] = repo

        # Process extra messages
        total_message_batches = len(extra_messages_futures_repo_mapping)
        for completed_future in futures.as_completed(extra_messages_futures_repo_mapping):
            repo = extra_messages_futures_repo_mapping[completed_future]

            exc = completed_future.exception()
            if exc:
                message = 'error processing extra messages for %s: %s' % (repo, str(exc))
                logger.error(message)

                # Add error to audit output
                outputdata[repo].append(message)
            else:
                logger.info('extra processing for %s completed successfully' % repo)

            total_message_batches -= 1
            logger.info('%s batches remaining' % total_message_batches)

    logger.info('%s bootstrap process complete' % config.c.get('consumer', 'group'))

    # If anything broke, dump the errors and set exit code 1
    if outputdata:
        with open('/repo/hg/hgweb_bootstrap_out.json', 'w') as f:
            f.write(json.dumps(outputdata))
        return 1
def Consumer(thread_name, topic, partition):
    print(thread_name, "Starting\tDispose")
    global is_dispose
    broker_list = '172.16.90.63:6667, 172.16.90.58:6667, 172.16.90.59:6667'
    '''
    fetch_min_bytes (int) - minimum amount of data the server should return for a fetch request;
        otherwise wait for more data to accumulate
    fetch_max_wait_ms (int) - maximum amount of time (in milliseconds) the server will block before
        answering the fetch request if there is not enough data to immediately satisfy fetch_min_bytes
    fetch_max_bytes (int) - maximum amount of data the server should return for a fetch request.
        This is not an absolute maximum: if the first message in the first non-empty partition of the
        fetch is larger than this value, the message will still be returned to ensure that the consumer
        can make progress. Note: the consumer performs fetches to multiple brokers in parallel, so memory
        usage will depend on the number of brokers containing partitions for the topic.
        Supported Kafka version >= 0.10.1.0. Default: 52428800 (50 MB).
    enable_auto_commit (bool) - if True, the consumer's offset will be periodically committed in the
        background. Default: True.
    max_poll_records (int) - maximum number of records returned in a single call to poll(). Default: 500
    max_poll_interval_ms (int) - maximum delay between invocations of poll() when using consumer group
        management. This places an upper bound on the amount of time the consumer can be idle before
        fetching more records. If poll() is not called before this timeout expires, the consumer is
        considered failed and the group will rebalance in order to reassign the partitions to another
        member. Default: 300000
    '''
    consumer = KafkaConsumer(
        bootstrap_servers=broker_list,
        group_id="xiaofei",
        client_id=thread_name,
        # auto_offset_reset="smallest",
        enable_auto_commit=False,
        fetch_min_bytes=1024 * 1024,
        # fetch_max_bytes=1024 * 1024 * 1024 * 10,
        fetch_max_wait_ms=60000,
        request_timeout_ms=305000,
        # consumer_timeout_ms=1,
        # max_poll_records=5000,
        # max_poll_interval_ms=60000  # this parameter is not available here
    )
    dic = get_kafka(topic, partition)
    tp = TopicPartition(topic, partition)
    # print(thread_name, tp, dic['offset'])
    consumer.assign([tp])
    # seek the partition to the stored offset
    consumer.seek(tp, dic['offset'])
    print("First run\tthread:", thread_name, "partition:", partition,
          "offset:", dic['offset'], "\tstart consuming...")
    num = 0
    # end_offset = consumer.end_offsets([tp])[tp]
    # print(end_offset)
    while True:
        args = OrderedDict()
        checkThread()
        msg = consumer.poll(timeout_ms=60000)
        end_offset = consumer.end_offsets([tp])[tp]
        print('committed offset', consumer.committed(tp), 'latest offset,', end_offset)
        # simulate a dead thread for testing
        # if thread_name == "Thread-1" and num == 2:
        #     sys.exit()
        if len(thread_msg) > 0 and is_dispose is True:
            is_dispose = False
            for msg_send in thread_msg:
                exp(msg_send)
                send_msg(msg_send)
            thread_msg.clear()
        if len(msg) > 0:
            print("thread:", thread_name, "partition:", partition,
                  "max offset:", end_offset, "records:", len(msg))
            lines = 0
            for data in msg.values():
                for line in data:
                    lines += 1
                    line = eval(line.value.decode('utf-8'))
                    value, log_name = get_line(col_dic, line)
                    sql = sql_dic[log_name]
                    if value is not None:
                        args.setdefault(sql, []).append(tuple(value))
            print(thread_name, "processed", lines, "records")
            # save the data to the database
            is_succeed = save_to_db(args, thread_name)
            if is_succeed:
                # update the partition offset stored in the database
                is_succeed1 = update_offset(topic, partition, end_offset)
                # manually commit the offset to kafka
                consumer.commit(
                    offsets={tp: (OffsetAndMetadata(end_offset, None))})
                # print(thread_name, "to db suss", num+1)
                if is_succeed1 == 0:
                    sys.exit()
            else:
                sys.exit()
        else:
            pass
            # print(thread_name, 'no data')
            # time.sleep(60)
        num += 1
def test_commit(consumer):
    partition = TopicPartition('test', 2)
    offset_metadata = OffsetAndMetadata(2, 'xx')
    response = consumer.commit({partition: offset_metadata})
    print(response)
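The `consumer` argument is presumably injected by a pytest fixture. A minimal sketch of such a fixture, with the broker address and group id as placeholder values:

import pytest
from kafka import KafkaConsumer


@pytest.fixture
def consumer():
    # Placeholder broker and group id; point these at the environment under test
    c = KafkaConsumer(bootstrap_servers='localhost:9092',
                      group_id='test-group',
                      enable_auto_commit=False)
    yield c
    c.close()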
args = {
    # "security_protocol": "SSL",
    # "ssl_cafile": "/Users/yixiang/Projects/ISI/mydig-projects/dig3_ht/kafka_ssl/ca-cert.pem",
    # "ssl_certfile": "/Users/yixiang/Projects/ISI/mydig-projects/dig3_ht/kafka_ssl/client-cert.pem",
    # "ssl_keyfile": "/Users/yixiang/Projects/ISI/mydig-projects/dig3_ht/kafka_ssl/client-key.pem",
    # "ssl_check_hostname": False
}
part_num = 1
group_id = 'dig_test'
topic_id = 'test'

consumer = KafkaConsumer(
    bootstrap_servers=broker_list,
    group_id=group_id,
    **args
)

meta = consumer.partitions_for_topic(topic_id)

assigned_parts = []
for i in range(part_num):
    assigned_parts.append(TopicPartition(topic_id, i))
consumer.assign(assigned_parts)

for p in assigned_parts:
    consumer.seek(p, 0)
    # sometimes it is blocked, need to restart
    consumer.commit({p: OffsetAndMetadata(0, meta)})

print('done')