# ZooKeeper-backed consistent scheduler (earlier revision, without
# connection-state reporting). Imports reconstructed from the names used
# below; the ConsistentHash import path is assumed.
import hashlib
import logging
import os
import struct
import sys
from random import randint

import gevent
from consistent_hash import ConsistentHash  # import path assumed
from kazoo.client import KazooClient


class ConsistentScheduler(object):
    '''
        LibPartitionHelper abstracts out workers and work_items, and their
        mapping to partitions, so the application only deals with the work
        items it owns, without worrying about the partition mapping.

        This class also provides synchronization primitives so that apps
        can clean up before giving up their partitions.
    '''
    _MAX_WAIT_4_ALLOCATION = 6 + randint(0, 9)

    def __init__(self, service_name=None, zookeeper='127.0.0.1:2181',
                 delete_hndlr=None, add_hndlr=None, bucketsize=47,
                 item2part_func=None, partitioner=None, logger=None):
        if logger:
            self._logger = logger
        else:
            self._logger = logging.getLogger(__name__)
        self._service_name = service_name or os.path.basename(sys.argv[0])
        self._item2part_func = item2part_func or self._device2partition
        self._zookeeper_srvr = zookeeper
        self._bucketsize = bucketsize
        self._delete_hndlr = delete_hndlr
        self._add_hndlr = add_hndlr
        self._partitioner = partitioner or self._partitioner_func
        self._partitions = {}
        self._con_hash = None
        self._last_log = ''
        self._last_log_cnt = 0
        self._partition_set = map(str, range(self._bucketsize))
        self._zk_path = '/'.join(['/contrail_cs', self._service_name])
        self._zk = KazooClient(self._zookeeper_srvr)
        self._zk.add_listener(self._zk_lstnr)
        self._zk.start()
        self._pc = self._zk.SetPartitioner(path=self._zk_path,
                                           set=self._partition_set,
                                           partition_func=self._partitioner)
        self._wait_allocation = 0
        gevent.sleep(0)

    def _zk_lstnr(self, state):
        self._supress_log('zk state change to %s' % str(state))

    def schedule(self, items, lock_timeout=30):
        gevent.sleep(0)
        ret = False
        if self._pc.failed:
            raise Exception("Lost or unable to acquire partition")
        elif self._pc.release:
            self._supress_log('Releasing...')
            self._release()
        elif self._pc.allocating:
            self._supress_log('Waiting for allocation...')
            self._pc.wait_for_acquire(lock_timeout)
            if self._wait_allocation < self._MAX_WAIT_4_ALLOCATION:
                self._wait_allocation += 1
            else:
                raise StopIteration('Giving up after %d tries!' %
                                    self._wait_allocation)
        elif self._pc.acquired:
            self._supress_log('got work: ', list(self._pc))
            ret = True
            self._wait_allocation = 0
            self._populate_work_items(items)
            self._supress_log('work items: ',
                              self._items2name(self.work_items()),
                              'from the list',
                              self._items2name(items))
        return ret

    def work_items(self):
        return sum(self._partitions.values(), [])

    def finish(self):
        self._inform_delete(self._partitions.keys())
        self._pc.finish()

    def _items2name(self, items):
        return map(lambda x: x.name, items)

    def _supress_log(self, *s):
        # Log a message, collapsing consecutive duplicates into a single
        # "[last print repeats N times]" line.
        slog = ' '.join(map(str, s))
        dl = ''
        if slog != self._last_log:
            if self._last_log_cnt:
                dl += ' ' * 4
                dl += '.' * 8
                dl += '[last print repeats %d times]' % self._last_log_cnt
                self._last_log_cnt = 0
            dl += slog
            self._last_log = slog
            self._logger.debug(dl)
        else:
            self._last_log_cnt += 1

    def _consistent_hash(self, members):
        if self._con_hash is None:
            self._con_hash = ConsistentHash(members)
            self._supress_log('members:', self._con_hash.nodes)
        cur, updtd = set(self._con_hash.nodes), set(members)
        if cur != updtd:
            newm = updtd - cur
            rmvd = cur - updtd
            if newm:
                self._supress_log('new workers:', newm)
                self._con_hash.add_nodes(list(newm))
            if rmvd:
                self._supress_log('workers left:', rmvd)
                self._con_hash.del_nodes(list(rmvd))
        return self._con_hash

    def _consistent_hash_get_node(self, members, partition):
        return self._consistent_hash(members).get_node(partition)

    def _partitioner_func(self, identifier, members, _partitions):
        # Own exactly those partitions that consistent-hash to this worker.
        return [p for p in _partitions
                if self._consistent_hash_get_node(members, p) == identifier]

    def _release(self):
        old = set(self._pc)
        new = set(self._partitioner(self._pc._identifier,
                                    list(self._pc._party),
                                    self._partition_set))
        rmvd = old - new
        added = new - old
        if rmvd:
            self._inform_delete(list(rmvd))
        if added:
            self._inform_will_add(list(added))
        self._pc.release_set()

    def _list_items_in(self, partitions):
        return sum([self._partitions[k] for k in partitions
                    if k in self._partitions], [])

    def _inform_will_add(self, partitions):
        if callable(self._add_hndlr):
            self._add_hndlr(self._list_items_in(partitions))

    def _inform_delete(self, partitions):
        if callable(self._delete_hndlr):
            self._delete_hndlr(self._list_items_in(partitions))

    def _populate_work_items(self, items):
        self._refresh_work_items()
        for i in items:
            part = str(self._item2part_func(i.name))
            if part in list(self._pc):
                if part not in self._partitions:
                    self._partitions[part] = []
                if i.name not in map(lambda x: x.name,
                                     self._partitions[part]):
                    self._partitions[part].append(i)
        self._logger.debug('@populate_work_items(%s): done!' % ' '.join(
            map(lambda v: str(v[0]) + ':' + ','.join(
                map(lambda x: x.name, v[1])),
                self._partitions.items())))
        gevent.sleep(0)

    def _device2partition(self, key):
        # Map an item name to a partition bucket: take the last 8 bytes of
        # the MD5 digest as an unsigned 64-bit int, modulo the bucket count.
        return struct.unpack(
            'Q', hashlib.md5(key).digest()[-8:])[0] % self._bucketsize

    def _refresh_work_items(self):
        for k in self._partitions:
            self._partitions[k] = []
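# --- Usage sketch (illustrative only, not part of the module) ---
# A minimal example of how an application might drive ConsistentScheduler.
# The WorkItem class and the handler functions below are hypothetical;
# callers may pass any objects that expose a `.name` attribute.

class WorkItem(object):
    def __init__(self, name):
        self.name = name


def _on_items_added(items):
    # Invoked before a rebalance hands this worker new partitions.
    print('taking over: %s' % [i.name for i in items])


def _on_items_deleted(items):
    # Invoked before this worker gives partitions (and their items) away.
    print('giving up: %s' % [i.name for i in items])


def _example_scheduler_loop():
    devices = [WorkItem('device-%d' % i) for i in range(10)]
    css = ConsistentScheduler(service_name='example-svc',
                              zookeeper='127.0.0.1:2181',
                              add_hndlr=_on_items_added,
                              delete_hndlr=_on_items_deleted)
    try:
        while True:
            # schedule() returns True once this worker's partitions are
            # acquired; work_items() is then the slice this worker owns.
            if css.schedule(devices):
                for item in css.work_items():
                    pass  # process item
            gevent.sleep(5)
    finally:
        css.finish()  # fire the delete handler and release partitions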
# Later revision of ConsistentScheduler: adds cluster_id-aware ZooKeeper
# paths, a gevent Kazoo handler, startup retry, and Contrail sandesh
# connection-state reporting. Imports reconstructed from the names used
# below; the consistent_hash and pysandesh import paths are assumed.
import hashlib
import logging
import os
import struct
import sys
import traceback
from random import randint

import gevent
from consistent_hash import ConsistentHash  # import path assumed
from kazoo.client import KazooClient
from kazoo.handlers.gevent import SequentialGeventHandler
from kazoo.protocol.states import KazooState

# Sandesh connection-reporting helpers; module paths assumed.
from pysandesh.connection_info import ConnectionState
from pysandesh.gen_py.process_info.ttypes import ConnectionStatus, \
    ConnectionType


class ConsistentScheduler(object):
    '''
        LibPartitionHelper abstracts out workers and work_items, and their
        mapping to partitions, so the application only deals with the work
        items it owns, without worrying about the partition mapping.

        This class also provides synchronization primitives so that apps
        can clean up before giving up their partitions.
    '''
    _MAX_WAIT_4_ALLOCATION = 6 + randint(0, 9)

    def __init__(self, service_name=None, zookeeper='127.0.0.1:2181',
                 delete_hndlr=None, add_hndlr=None, bucketsize=47,
                 item2part_func=None, partitioner=None, logger=None,
                 cluster_id=''):
        if logger:
            self._logger = logger
        else:
            self._logger = logging.getLogger(__name__)
        self._service_name = service_name or os.path.basename(sys.argv[0])
        self._item2part_func = item2part_func or self._device2partition
        self._zookeeper_srvr = zookeeper
        self._zk = None
        self._bucketsize = bucketsize
        self._delete_hndlr = delete_hndlr
        self._add_hndlr = add_hndlr
        self._partitioner = partitioner or self._partitioner_func
        self._partitions = {}
        self._con_hash = None
        self._last_log = ''
        self._last_log_cnt = 0
        self._partition_set = map(str, range(self._bucketsize))
        self._cluster_id = cluster_id
        if self._cluster_id:
            self._zk_path = '/' + self._cluster_id + '/contrail_cs' + \
                '/' + self._service_name
        else:
            self._zk_path = '/'.join(['/contrail_cs', self._service_name])
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        # Keep (re)creating the client until the session comes up; a failed
        # start tears the client down and retries after a short sleep.
        while True:
            self._logger.error("Consistent scheduler zk start")
            self._zk = KazooClient(self._zookeeper_srvr,
                                   handler=SequentialGeventHandler())
            self._zk.add_listener(self._zk_lstnr)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_lstnr)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = ("Exception {0} in Consistent scheduler zk "
                                "stop/close. Args:\n{1!r}")
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" %
                                       (messag, traceback.format_exc(),
                                        self._service_name))
                finally:
                    self._zk = None
                gevent.sleep(1)
        self._pc = self._zk.SetPartitioner(path=self._zk_path,
                                           set=self._partition_set,
                                           partition_func=self._partitioner)
        self._wait_allocation = 0
        gevent.sleep(0)

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper', status=new_conn_state,
                               message=message,
                               server_addrs=self._zookeeper_srvr.split(','))
        if ((self._conn_state and
                self._conn_state != ConnectionStatus.DOWN) and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._supress_log(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._supress_log(msg)
        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_lstnr(self, state):
        self._logger.error("Consistent scheduler listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Consistent scheduler connection LOST")
            # We lost the session with the ZooKeeper server. The best option
            # we have is to exit the process and start all over again.
            self._sandesh_connection_info_update(
                status='DOWN',
                message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Consistent scheduler connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def schedule(self, items, lock_timeout=30):
        gevent.sleep(0)
        ret = False
        if self._pc.failed:
            self._logger.error('Lost or unable to acquire partition')
            os._exit(2)
        elif self._pc.release:
            self._supress_log('Releasing...')
            self._release()
        elif self._pc.allocating:
            self._supress_log('Waiting for allocation...')
            self._pc.wait_for_acquire(lock_timeout)
            if self._wait_allocation < self._MAX_WAIT_4_ALLOCATION:
                self._wait_allocation += 1
            else:
                self._logger.error('Giving up after %d tries!' %
                                   self._wait_allocation)
                os._exit(2)
        elif self._pc.acquired:
            self._supress_log('got work: ', list(self._pc))
            ret = True
            self._wait_allocation = 0
            self._populate_work_items(items)
            self._supress_log('work items: ',
                              self._items2name(self.work_items()),
                              'from the list',
                              self._items2name(items))
        return ret

    def members(self):
        return list(self._con_hash.nodes)

    def partitions(self):
        return list(self._pc)

    def work_items(self):
        return sum(self._partitions.values(), [])

    def finish(self):
        self._inform_delete(self._partitions.keys())
        self._pc.finish()
        self._zk.remove_listener(self._zk_lstnr)
        gevent.sleep(1)
        try:
            self._zk.stop()
        except Exception:
            self._logger.error("Stopping kazooclient failed")
        else:
            self._logger.error("Stopping kazooclient successful")
        try:
            self._zk.close()
        except Exception:
            self._logger.error("Closing kazooclient failed")
        else:
            self._logger.error("Closing kazooclient successful")

    def _items2name(self, items):
        return map(lambda x: x.name, items)

    def _supress_log(self, *s):
        # Log a message, collapsing consecutive duplicates into a single
        # "[last print repeats N times]" line.
        slog = ' '.join(map(str, s))
        dl = ''
        if slog != self._last_log:
            if self._last_log_cnt:
                dl += ' ' * 4
                dl += '.' * 8
                dl += '[last print repeats %d times]' % self._last_log_cnt
                self._last_log_cnt = 0
            dl += slog
            self._last_log = slog
            self._logger.debug(dl)
        else:
            self._last_log_cnt += 1

    def _consistent_hash(self, members):
        if self._con_hash is None:
            self._con_hash = ConsistentHash(members)
            self._logger.error('members: %s' % str(self._con_hash.nodes))
        cur, updtd = set(self._con_hash.nodes), set(members)
        if cur != updtd:
            newm = updtd - cur
            rmvd = cur - updtd
            if newm:
                self._logger.error('new members: %s' % str(newm))
                self._con_hash.add_nodes(list(newm))
            if rmvd:
                self._logger.error('members left: %s' % str(rmvd))
                self._con_hash.del_nodes(list(rmvd))
        return self._con_hash

    def _consistent_hash_get_node(self, members, partition):
        return self._consistent_hash(members).get_node(partition)

    def _partitioner_func(self, identifier, members, _partitions):
        # Own exactly those partitions that consistent-hash to this worker.
        partitions = [p for p in _partitions
                      if self._consistent_hash_get_node(members, p) ==
                      identifier]
        self._logger.error('partitions: %s' % str(partitions))
        return partitions

    def _release(self):
        old = set(self._pc)
        new = set(self._partitioner(self._pc._identifier,
                                    list(self._pc._party),
                                    self._partition_set))
        rmvd = old - new
        added = new - old
        if rmvd:
            self._inform_delete(list(rmvd))
        if added:
            self._inform_will_add(list(added))
        self._pc.release_set()

    def _list_items_in(self, partitions):
        return sum([self._partitions[k] for k in partitions
                    if k in self._partitions], [])

    def _inform_will_add(self, partitions):
        if callable(self._add_hndlr):
            self._add_hndlr(self._list_items_in(partitions))

    def _inform_delete(self, partitions):
        if callable(self._delete_hndlr):
            self._delete_hndlr(self._list_items_in(partitions))

    def _populate_work_items(self, items):
        self._refresh_work_items()
        for i in items:
            part = str(self._item2part_func(i.name))
            if part in list(self._pc):
                if part not in self._partitions:
                    self._partitions[part] = []
                if i.name not in map(lambda x: x.name,
                                     self._partitions[part]):
                    self._partitions[part].append(i)
        self._logger.debug('@populate_work_items(%s): done!' % ' '.join(
            map(lambda v: str(v[0]) + ':' + ','.join(
                map(lambda x: x.name, v[1])),
                self._partitions.items())))
        gevent.sleep(0)

    def _device2partition(self, key):
        # Map an item name to a partition bucket: take the last 8 bytes of
        # the MD5 digest as an unsigned 64-bit int, modulo the bucket count.
        return struct.unpack(
            'Q', hashlib.md5(key).digest()[-8:])[0] % self._bucketsize

    def _refresh_work_items(self):
        for k in self._partitions:
            self._partitions[k] = []
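# --- Worked example of the item-to-partition mapping ---
# _device2partition() buckets an item name by interpreting the last 8 bytes
# of its MD5 digest as an unsigned 64-bit integer ('Q') and reducing it
# modulo bucketsize. The standalone sketch below mirrors that logic; the
# .encode() call makes it runnable on Python 3 as well (the class above is
# Python 2 code and hashes the str directly).

import hashlib
import struct


def device2partition(key, bucketsize=47):
    digest = hashlib.md5(key.encode('utf-8')).digest()
    return struct.unpack('Q', digest[-8:])[0] % bucketsize


# The mapping is deterministic (per platform, since 'Q' uses native byte
# order), so every worker derives the same partition for a given name. That
# is what lets _partitioner_func assign partitions to workers with no
# coordination beyond the ZooKeeper member list:
#   device2partition('device-0')  ->  a stable value in [0, 47)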
# Kafka consumer-group partitioner built on Kazoo's SetPartitioner.
import copy
import logging
import time
import traceback
from collections import defaultdict

from kafka import KafkaClient
from kafka.util import kafka_bytestring
from kazoo.client import KazooClient
from kazoo.protocol.states import KazooState
from kazoo.recipe.partitioner import PartitionState
from kazoo.retry import KazooRetry

# PartitionerError, PartitionerZookeeperError, build_zk_group_path,
# get_kafka_topics, KAZOO_RETRY_DEFAULTS and PARTITIONS_REFRESH_TIMEOUT are
# defined elsewhere in the package this class comes from.


class Partitioner(object):
    """Partitioner is used to distribute a set of topics/partitions among a
    group of consumers.

    :param topics: kafka topics
    :type topics: list
    :param acquire: function to be called when a set of partitions has been
                    acquired. It should usually allocate the consumers.
    :param release: function to be called when the acquired partitions have
                    to be released. It should usually stop the consumers.
    """

    def __init__(self, config, topics, acquire, release):
        self.log = logging.getLogger(self.__class__.__name__)
        self.config = config
        # Clients
        self.kazoo_client = None
        self.kafka_client = None
        self.topics = topics
        self.acquired_partitions = defaultdict(list)
        self.partitions_set = set()
        # User callbacks
        self.acquire = acquire
        self.release = release
        # We guarantee that the user-defined release function is always
        # called after acquire, and never twice in a row. Initialize to
        # True because no partitions have been acquired at startup.
        self.released_flag = True
        # Kafka metadata refresh
        self.force_partitions_refresh = True
        self.last_partitions_refresh = 0
        # Kazoo partitioner
        self._partitioner = None
        # Map Kazoo partitioner state to actions
        self.actions = {
            PartitionState.ALLOCATING: self._allocating,
            PartitionState.ACQUIRED: self._acquire,
            PartitionState.RELEASE: self._release,
            PartitionState.FAILURE: self._fail,
        }
        self.kazoo_retry = None
        self.zk_group_path = build_zk_group_path(
            self.config.group_path,
            self.topics,
        ) if self.config.use_group_sha else self.config.group_path

    def start(self):
        """Create a new group and wait until the partitions have been
        acquired. This function should never be called twice.

        :raises: PartitionerError upon partitioner failures

        .. note: This is a blocking operation.
        """
        self.kazoo_retry = KazooRetry(**KAZOO_RETRY_DEFAULTS)
        self.kazoo_client = KazooClient(
            self.config.zookeeper,
            connection_retry=self.kazoo_retry,
        )
        self.kafka_client = KafkaClient(self.config.broker_list)
        self.log.debug("Starting a new group for topics %s", self.topics)
        self.released_flag = True
        self._refresh()

    def __enter__(self):
        self.start()

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

    def stop(self):
        """Leave the group and release the partitions."""
        self.log.debug("Stopping group for topics %s", self.topics)
        self.release_and_finish()
        self._close_connections()

    def refresh(self):
        """Rebalance upon group changes, such as when a consumer
        joins/leaves the group, the partitions for a topic change, or the
        partitioner itself fails (connection to zookeeper lost). This
        method should be called periodically to make sure that the group
        is in sync.

        :raises: PartitionerError upon partitioner failures
        """
        self.log.debug("Refresh group for topics %s", self.topics)
        self._refresh()

    def _refresh(self):
        while True:
            partitioner = self._get_partitioner()
            self._handle_group(partitioner)
            if self.acquired_partitions:
                break

    def need_partitions_refresh(self):
        return (self.force_partitions_refresh or
                self.last_partitions_refresh <
                time.time() - PARTITIONS_REFRESH_TIMEOUT)

    def _get_partitioner(self):
        """Get an instance of the partitioner.

        If the partitioner does not exist yet, create a new one. If the
        partitions set has changed, destroy the existing partitioner and
        create a new one; different consumers will eventually converge on
        the same partitions set.
        """
        if self.need_partitions_refresh() or not self._partitioner:
            try:
                partitions = self.get_partitions_set()
            except Exception:
                self.log.exception("Failed to get partitions set from "
                                   "Kafka. Releasing the group.")
                self.release_and_finish()
                raise PartitionerError(
                    "Failed to get partitions set from Kafka",
                )
            self.force_partitions_refresh = False
            self.last_partitions_refresh = time.time()
            if partitions != self.partitions_set:
                # If partitions changed we release the consumers, destroy
                # the partitioner and disconnect from zookeeper.
                self.log.info(
                    "Partitions set changed. New partitions: %s. "
                    "Old partitions: %s. Rebalancing...",
                    [p for p in partitions if p not in self.partitions_set],
                    [p for p in self.partitions_set if p not in partitions],
                )
                # We need to destroy the existing partitioner before
                # creating a new one.
                self.release_and_finish()
                self._partitioner = self._create_partitioner(partitions)
                self.partitions_set = partitions
        return self._partitioner

    def _create_partitioner(self, partitions):
        """Connect to zookeeper and create a partitioner"""
        if self.kazoo_client.state != KazooState.CONNECTED:
            try:
                self.kazoo_client.start()
            except Exception:
                self.log.exception("Impossible to connect to zookeeper")
                self.release_and_finish()
                raise PartitionerError("Zookeeper connection failure")
        self.log.debug(
            "Creating partitioner for group %s, topic %s,"
            " partitions set %s", self.config.group_id, self.topics,
            partitions)
        return self.kazoo_client.SetPartitioner(
            path=self.zk_group_path,
            set=partitions,
            time_boundary=self.config.partitioner_cooldown,
        )

    def release_and_finish(self):
        """Release consumers and terminate the partitioner"""
        if self._partitioner:
            self._release(self._partitioner)
            self._partitioner.finish()
        self._partitioner = None

    def _close_connections(self):
        self.kafka_client.close()
        self.partitions_set = set()
        self.last_partitions_refresh = 0
        self.kazoo_client.stop()
        self.kazoo_client.close()
        self.kazoo_retry = None

    def _handle_group(self, partitioner):
        """Handle group status changes, for example when a new consumer
        joins or leaves the group.
        """
        if partitioner:
            try:
                self.actions[partitioner.state](partitioner)
            except KeyError:
                self.log.exception("Unexpected partitioner state.")
                self.release_and_finish()
                raise PartitionerError("Invalid partitioner state %s" %
                                       partitioner.state)

    def _allocating(self, partitioner):
        """Usually we don't want to do anything but wait in the
        allocating state.
        """
        partitioner.wait_for_acquire()

    def _acquire(self, partitioner):
        """Acquire kafka topics-[partitions] and start the consumers for
        them.
        """
        acquired_partitions = self._get_acquired_partitions(partitioner)
        if acquired_partitions != self.acquired_partitions:
            # TODO: Decrease logging level
            self.log.info(
                "Total number of acquired partitions = %s; it was %s "
                "before. Added partitions: %s. Removed partitions: %s",
                len(acquired_partitions),
                len(self.acquired_partitions),
                [p for p in acquired_partitions
                 if p not in self.acquired_partitions],
                [p for p in self.acquired_partitions
                 if p not in acquired_partitions],
            )
            self.acquired_partitions = acquired_partitions
            try:
                self.acquire(copy.deepcopy(self.acquired_partitions))
                self.released_flag = False
            except Exception:
                self.log.exception("Acquire action failed.")
                trace = traceback.format_exc()
                self.release_and_finish()
                raise PartitionerError(
                    "Acquire action failed. "
                    "Acquire error: {trace}".format(trace=trace))

    def _release(self, partitioner):
        """Release the consumers and acquired partitions.

        This function is executed either at termination time or whenever
        there is a group change.
        """
        self.log.debug("Releasing partitions")
        try:
            if not self.released_flag:
                self.release(self.acquired_partitions)
                self.released_flag = True
        except Exception:
            trace = traceback.format_exc()
            self.log.exception("Release action failed.")
            raise PartitionerError(
                "Release action failed. "
                "Release error: {trace}".format(trace=trace),
            )
        partitioner.release_set()
        self.acquired_partitions.clear()
        self.force_partitions_refresh = True

    def _fail(self, partitioner):
        """Handle zookeeper failures.

        Executed when the consumer group is not able to recover the
        connection. In this case, we cowardly stop the running consumers.
        """
        self.log.error("Lost or unable to acquire partitions")
        self.release_and_finish()
        raise PartitionerZookeeperError(
            "Internal partitioner error. "
            "Lost connection to zookeeper: {cluster}".format(
                cluster=self.config.zookeeper,
            ))

    def _get_acquired_partitions(self, partitioner):
        """Retrieve acquired partitions from a partitioner.

        :returns: acquired topic and partitions
        :rtype: dict {<topic>: <[partitions]>}
        """
        acquired_partitions = defaultdict(list)
        for partition in partitioner:
            topic, partition_id = partition.rsplit('-', 1)
            acquired_partitions[topic].append(int(partition_id))
        return acquired_partitions

    def get_partitions_set(self):
        """Load partitions metadata from kafka and create a set containing
        "<topic>-<partition_id>".

        :returns: partitions for user topics
        :rtype: set
        :raises PartitionerError: if no partitions have been found
        """
        topic_partitions = get_kafka_topics(self.kafka_client)
        partitions = []
        missing_topics = set()
        for topic in self.topics:
            kafka_topic = kafka_bytestring(topic)
            if kafka_topic not in topic_partitions:
                missing_topics.add(topic)
            else:
                partitions += [
                    "{0}-{1}".format(topic, p)
                    for p in topic_partitions[kafka_topic]
                ]
        if missing_topics:
            self.log.info("Missing topics: %s", missing_topics)
        if not partitions:
            self.release_and_finish()
            raise PartitionerError(
                "No partitions found for topics: {topics}".format(
                    topics=self.topics))
        return set(partitions)
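# --- Usage sketch (illustrative only, not part of the module) ---
# How a consumer group might drive Partitioner. The config object here is
# hypothetical: only the attributes this class actually reads are relevant
# (zookeeper, broker_list, group_id, group_path, use_group_sha,
# partitioner_cooldown); real callers should use the config type their
# package provides.

def _example_partitioner_loop(config, topics):
    def on_acquire(acquired):
        # acquired is {topic: [partition_ids]}; start consumers here.
        print('acquired: %s' % dict(acquired))

    def on_release(acquired):
        # Called before partitions are given back; stop consumers here.
        print('releasing: %s' % dict(acquired))

    partitioner = Partitioner(config, topics, on_acquire, on_release)
    partitioner.start()  # blocks until partitions are acquired
    try:
        while True:
            time.sleep(1)
            # Rebalance on group membership or partition-set changes, per
            # the refresh() docstring ("should be called periodically").
            partitioner.refresh()
    finally:
        partitioner.stop()  # release partitions and leave the group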