class ZookeeperMasterDetector(FutureMasterDetector):
    """Master detector that follows the membership of a ZooKeeper group.

    The member whose name sorts first is treated as the leader; its node
    data is parsed as a ``MasterInfo`` message and handed to ``appoint``.
    """

    @classmethod
    def from_uri(cls, uri):
        """Build a detector from a ``zk://<ensemble>/<path>`` URI.

        :param uri: URI whose network location is the ensemble and whose
            path is the group path.
        :raises InvalidUrl: if the URI scheme is not ``zk``.
        """
        url = urlparse(uri)
        if url.scheme.lower() != 'zk':
            # This is a classmethod, so there is no ``self`` here; the
            # exception type has to be looked up on ``cls``.
            raise cls.InvalidUrl(
                'ZookeeperMasterDetector got invalid ensemble URI %s' % uri)
        return cls(url.netloc, url.path)

    def __init__(self, ensemble, path):
        """Start a kazoo client against *ensemble* and monitor group *path*."""
        super(ZookeeperMasterDetector, self).__init__()
        self._kazoo_client = KazooClient(ensemble)
        self._kazoo_client.start_async()
        self._group = MesosKazooGroup(self._kazoo_client, path)
        self._group.monitor(callback=self.on_change)

    def on_change(self, membership):
        """Handle a membership change and re-arm the group monitor."""
        if membership:
            # Same element as sorted(membership)[0], without the full sort.
            leader = min(membership)
            self._group.info(leader, callback=self.on_appointment)
        # Keep monitoring even when the group is currently empty.
        self._group.monitor(membership, callback=self.on_change)

    def on_appointment(self, master_data):
        """Decode the leader's serialized ``MasterInfo`` and appoint it."""
        master_info = MasterInfo()
        master_info.MergeFromString(master_data)
        self.appoint(master_info_to_pid(master_info))
def main():
    """Entry point of the long-running LSDA worker daemon.

    Parses the command line, connects to ZooKeeper and AMQP, routes logging
    through AMQP, declares the job queue and then blocks on the runner.
    """
    # Command-line interface.
    arg_parser = argparse.ArgumentParser(description='Run an LSDA worker node.')
    arg_parser.add_argument('--zookeeper', action='append', required=True)
    arg_parser.add_argument('--amqp', required=True)
    arg_parser.add_argument('--queue', default='stable')
    args = arg_parser.parse_args()

    # ZooKeeper connection (started asynchronously).
    zk_hosts = ','.join(args.zookeeper)
    zookeeper = KazooClient(hosts=zk_hosts, handler=SequentialGeventHandler())
    zookeeper.start_async()

    # AMQP connection with one channel for logs and one for jobs.
    connection = pika.BlockingConnection(pika.ConnectionParameters(args.amqp))
    logging_channel = connection.channel()
    jobs_channel = connection.channel()

    # Send root-logger output to the AMQP log channel.
    handler = AMQPLoggingHandler(logging_channel, 'lsda_logs')
    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.INFO)

    # Silence the extraneous packet logs emitted by Kazoo.
    import kazoo.client
    kazoo.client.log.setLevel(logging.WARN)

    # The queue we consume from must exist before we start.
    jobs_channel.queue_declare(args.queue, durable=True)

    # Pause so that flapping tasks do not hammer shared resources.
    gevent.sleep(10)

    # Hand over to the runner and wait for it.
    try:
        runner = EngineOrControllerRunner(zookeeper, jobs_channel, args.queue,
                                          handler)
        runner.join()
    except Exception:
        logging.exception("Unhandled exception at root level.")
        raise
def test_session_callback_states(self):
    """Check the client state after a series of session callback events."""
    from kazoo.protocol.states import KazooState, KeeperState
    from kazoo.client import KazooClient

    zk = KazooClient()
    zk._handle = 1
    zk._live.set()

    # A callback on a live client returns nothing.
    eq_(zk._session_callback(KeeperState.CONNECTED), None)

    # Same expectation once the client is flagged as stopped.
    zk._stopped.set()
    eq_(zk._session_callback(KeeperState.CONNECTED), None)

    # Walk through several state transitions.
    zk._stopped.clear()
    zk.start_async = lambda: True

    zk._session_callback(KeeperState.CONNECTED)
    eq_(zk.state, KazooState.CONNECTED)

    zk._session_callback(KeeperState.AUTH_FAILED)
    eq_(zk.state, KazooState.LOST)

    zk._handle = 1
    zk._session_callback(-250)
    eq_(zk.state, KazooState.SUSPENDED)
class ServicePublisher:
    """Publish this host's status as an ephemeral ZooKeeper node."""

    def __init__(self, hosts, timeout, publish_port):
        """Connect to ZooKeeper and publish the status node.

        Waits up to *timeout* for the connection; if it is not up by then
        the client is stopped and an error is logged (nothing is raised).
        """
        self._logger = logging.getLogger(self.__class__.__name__)
        self._publish_port = publish_port
        self._zk = KazooClient(hosts=hosts)

        connected_event = self._zk.start_async()
        connected_event.wait(timeout=timeout)

        if not self._zk.connected:
            self._zk.stop()
            self._logger.error('Kazoo client failed to connect')
            return

        self._logger.info('Kazoo client successfully connected')
        self._publish_status()

    def _publish_status(self):
        """Create the ephemeral node holding start time and published port."""
        hostname = socket.gethostname()
        full_path = '%s/%s' % (PARENT_NODE, hostname)
        # Keep only the first 19 characters of the timestamp string.
        started_at = str(datetime.datetime.now())[:19]
        data = {'started': started_at, 'port': self._publish_port}
        payload = json.dumps(data).encode(encoding='utf-8')
        self._logger.info('Publishing status %s to path %s' % (data, full_path))
        self._zk.create(full_path, payload, ephemeral=True, makepath=True)
def init_app(self, app):
    """
    Read kazoo settings from app configuration, setup kazoo client for
    application.

    Missing ``KAZOO_*`` settings are filled with defaults. The client is
    started either blocking (``KAZOO_START_BLOCKING``, bounded by
    ``KAZOO_START_TIMEOUT``) or asynchronously, and is stored under
    ``app.extensions['kazoo']['client']``.

    :param app: Flask application instance.
    """
    app.config.setdefault('KAZOO_HOSTS', '127.0.0.1:2181')
    app.config.setdefault('KAZOO_START_TIMEOUT', 3)
    app.config.setdefault('KAZOO_START_BLOCKING', False)
    app.config.setdefault('KAZOO_SESSION_TIMEOUT', 10.0)  # kazoo default
    app.config.setdefault('KAZOO_DEFAULT_RETRY', True)
    # kazoo default of 1hr.
    app.config.setdefault('KAZOO_RETRY_MAX_DELAY_SECONDS', 60 * 60)

    # Make sure the kazoo slot exists in the application extensions
    if 'kazoo' not in app.extensions:
        app.extensions['kazoo'] = {}

    # The same retry settings are used for connection and command retries;
    # None is passed when KAZOO_DEFAULT_RETRY is disabled.
    if app.config['KAZOO_DEFAULT_RETRY']:
        retry_kwargs = {
            'max_delay': app.config['KAZOO_RETRY_MAX_DELAY_SECONDS']
        }
    else:
        retry_kwargs = None

    kazoo_client = KazooClient(hosts=app.config['KAZOO_HOSTS'],
                               timeout=app.config['KAZOO_SESSION_TIMEOUT'],
                               connection_retry=retry_kwargs,
                               command_retry=retry_kwargs)

    # Register the listener BEFORE starting the client. Registered after
    # start, it can miss the initial connection transition (and always
    # does when the start is blocking).
    kazoo_client.add_listener(self.connection_state_listener)

    if app.config['KAZOO_START_BLOCKING']:
        kazoo_client.start(app.config['KAZOO_START_TIMEOUT'])
    else:
        kazoo_client.start_async()

    # Store the connection in the extensions
    app.extensions['kazoo']['client'] = kazoo_client
def start_kazoo(host: str, credentials: str, timeout: float = 10) -> KazooClient:
    """Start a connection to the Zookeeper server.

    Args:
        host: Zookeeper host string passed to ``KazooClient``.
        credentials: Digest credentials registered with the client.
        timeout: Seconds to wait for the connection to come up.

    Returns:
        The connected client.

    On timeout the error is reported through ``ErrorCodes.make_graceful``
    and the process exits with ``ErrorCodes.KAZOO_TIMEOUT``.
    """
    zk_client = KazooClient(hosts=host)
    zk_client.add_auth_async("digest", credentials)
    try:
        event = zk_client.start_async()
        event.wait(timeout=timeout)
        # wait() just returns when the timeout elapses; it does not raise.
        # The connection state has to be checked explicitly, otherwise a
        # dead server is reported as "connection established".
        if not zk_client.connected:
            zk_client.stop()  # stop the background connection attempts
            raise KazooTimeoutError(
                "No Zookeeper connection after %s seconds" % timeout)
        logger.info("Zookeeper connection established")
    except KazooTimeoutError as err:
        ErrorCodes.make_graceful(err, "Zookeeper server timed out")
        sys.exit(ErrorCodes.KAZOO_TIMEOUT.value)
    return zk_client
class ZookeeperSession(BaseClient):
    """Client session wrapping a gevent-based ``KazooClient``."""

    # NOTE(review): spelling kept as found; this looks like a typo of
    # "context_manager" - confirm which attribute name BaseClient reads.
    conext_manager = ZookeeperResponseContextManager

    # Extra KazooClient keyword arguments per session policy. They are only
    # read (never mutated) by connect().
    loose_policy = {}
    strict_policy = {}

    def __init__(self, server_list='127.0.0.1:2181', *args, **kwargs):
        """Remember *server_list*; no connection is made until connect()."""
        super(ZookeeperSession, self).__init__(*args, **kwargs)
        self.session_policy = "loose_policy"
        self._zookeeper_client = None
        self.server_list = server_list

    def set_session_policy(self, session_policy="loose"):
        '''Prototype, not currently used.'''
        self.session_policy = session_policy + "_policy"

    def connect(self, *args, **kwargs):
        '''See http://kazoo.readthedocs.org/en/latest/api/client.html for
        details regarding available options.

        Keyword arguments override the session policy, which in turn
        overrides the built-in defaults (hosts and a gevent handler).

        Raises ResponseError if the client is not connected after 30
        seconds; in that case the client is stopped first.
        '''
        defaults = {
            "hosts": self.server_list,
            "handler": SequentialGeventHandler()
        }
        defaults.update(getattr(self, self.session_policy))
        defaults.update(kwargs)

        self._state = KazooState.LOST
        self._zookeeper_client = KazooClient(**defaults)
        self._zookeeper_client.add_listener(self._state_tracker)
        watchable = self._zookeeper_client.start_async()
        watchable.wait(30)

        if not self._zookeeper_client.connected:
            # Stop the background connection attempts of a client we are
            # about to report as failed.
            self._zookeeper_client.stop()
            # The dict key is "hosts"; formatting with %(server_list)s
            # raised KeyError here instead of the intended ResponseError.
            err = ("Could not connect to Zookeeper server(s) %s"
                   % defaults["hosts"])
            raise ResponseError(err)

    @require_state(KazooState.CONNECTED)
    @record_stats
    def ensure_path(self, path, watcher=None):
        """Ensure *path* exists on the server."""
        # NOTE(review): `watcher` is forwarded as the second positional
        # argument of KazooClient.ensure_path, which kazoo documents as
        # `acl` - confirm what callers actually pass here.
        self._zookeeper_client.ensure_path(path, watcher)

    def _state_tracker(self, state):
        """Connection listener: remember the latest client state."""
        self._state = state

    def __del__(self):
        # The attribute does not exist if __init__ failed before setting it.
        client = getattr(self, '_zookeeper_client', None)
        if isinstance(client, KazooClient):
            client.stop()
def test_session_callback_states(self):
    """Check client state and handle after a series of session callbacks."""
    from kazoo.client import (KazooClient, KazooState, KeeperState,
                              EventType)

    zk = KazooClient()
    zk._handle = 1
    zk._live.set()

    # A non-session event on a live client returns nothing.
    eq_(zk._session_callback(1, EventType.CREATED,
                             KeeperState.CONNECTED, '/'), None)

    # Same expectation for a session event on a stopped client.
    zk._stopped.set()
    eq_(zk._session_callback(1, EventType.SESSION,
                             KeeperState.CONNECTED, '/'), None)

    # Walk through several state transitions.
    zk._stopped.clear()
    zk.start_async = lambda: True

    zk._session_callback(1, EventType.SESSION, KeeperState.CONNECTED, None)
    eq_(zk.state, KazooState.CONNECTED)

    zk._session_callback(1, EventType.SESSION, KeeperState.AUTH_FAILED, None)
    eq_(zk._handle, None)
    eq_(zk.state, KazooState.LOST)

    zk._handle = 1
    zk._session_callback(1, EventType.SESSION, -250, None)
    eq_(zk.state, KazooState.SUSPENDED)

    # Handle mismatch: the callback carries handle 1, the client holds 0,
    # so the event is expected to be ignored.
    zk._handle = 0
    zk._session_callback(1, EventType.SESSION, KeeperState.CONNECTED, None)
    eq_(zk.state, KazooState.SUSPENDED)
class Coordinator(object):
    """Master/slave coordination of sender nodes through ZooKeeper.

    Mastership is decided by the lock at ``/iris/sender_master``; the set of
    participating senders is tracked by the party at ``/iris/sender_nodes``.
    """

    def __init__(self, zk_hosts, hostname, port, join_cluster):
        """Connect to ZooKeeper and, if *join_cluster*, join the party.

        When not joining the cluster the client is opened read-only.
        """
        self.me = '%s:%s' % (hostname, port)
        self.is_master = None
        self.slaves = cycle([])
        self.slave_count = 0
        self.started_shutdown = False

        self.zk = KazooClient(hosts=zk_hosts,
                              handler=SequentialGeventHandler(),
                              read_only=not join_cluster)
        started = self.zk.start_async()
        started.wait(timeout=5)

        self.lock = self.zk.Lock(path='/iris/sender_master',
                                 identifier=self.me)

        # Keeps track of the slaves / senders present in the cluster
        self.party = Party(client=self.zk,
                           path='/iris/sender_nodes',
                           identifier=self.me)

        if join_cluster:
            self.zk.add_listener(self.event_listener)
            self.party.join()

    def am_i_master(self):
        """Return the cached master flag (True, False or None)."""
        return self.is_master

    def get_current_master(self):
        """Return ``(host, port)`` of the first lock contender, or None.

        Used by the API to find the current master.
        """
        try:
            contenders = self.lock.contenders()
        except kazoo.exceptions.KazooException:
            logger.exception('Failed getting contenders')
            return None

        if not contenders:
            return None
        return self.address_to_tuple(contenders[0])

    def get_current_slaves(self):
        """Return address tuples of all party members.

        Used by the API when the master cannot be reached.
        """
        return [self.address_to_tuple(member) for member in self.party]

    def address_to_tuple(self, address):
        """Split ``'host:port'`` into ``(host, int(port))``; None if malformed."""
        try:
            name, port_text = address.split(':')
            return name, int(port_text)
        except (IndexError, ValueError):
            logger.error('Failed getting address tuple from %s', address)
            return None

    def update_status(self):
        """Refresh the master flag and slave list from ZooKeeper state."""
        if self.started_shutdown:
            return

        # Step 1: work out whether we are master.
        if self.zk.state != KazooState.CONNECTED:
            logger.error('ZK connection is in %s state', self.zk.state)
            self.is_master = False
        elif self.lock.is_acquired:
            self.is_master = True
        else:
            try:
                self.is_master = self.lock.acquire(blocking=False, timeout=2)
            except kazoo.exceptions.CancelledError:
                # Expected while recovering from ZK having been down
                self.is_master = False
            except kazoo.exceptions.LockTimeout:
                self.is_master = False
                logger.exception(
                    "Failed trying to acquire lock (shouldn't happen as "
                    "we're using nonblocking locks)")
            except kazoo.exceptions.KazooException:
                self.is_master = False
                logger.exception(
                    'ZK problem while Failed trying to acquire lock')

        # Step 2: refresh the slave bookkeeping. The state is re-read on
        # purpose; it may have changed while the lock was being acquired.
        if self.zk.state != KazooState.CONNECTED:
            self.slaves = cycle([])
            self.slave_count = 0
            return

        if self.is_master:
            peers = [self.address_to_tuple(member)
                     for member in self.party
                     if member != self.me]
            self.slave_count = len(peers)
            self.slaves = cycle(peers)
        else:
            self.slaves = cycle([])
            self.slave_count = 0

        # Stay in the party so the current master sees us as a slave
        if not self.party.participating:
            try:
                self.party.join()
            except kazoo.exceptions.KazooException:
                logger.exception('ZK problem while trying to join party')

    def update_forever(self):
        """Poll ``update_status`` every UPDATE_FREQUENCY until shutdown."""
        while not self.started_shutdown:
            previous = self.is_master
            self.update_status()
            current = self.is_master

            # A role change is logged at info level, the steady state at debug.
            log = logger.info if previous != current else logger.debug
            log('I am the master sender' if self.is_master
                else 'I am a slave sender')

            metrics.set('slave_instance_count', self.slave_count)
            metrics.set('is_master_sender', int(self.is_master is True))

            sleep(UPDATE_FREQUENCY)

    def leave_cluster(self):
        """Mark shutdown, then leave the party and release the lock."""
        self.started_shutdown = True

        # Cancel pending attempts to take the master lock; they could hang us
        self.lock.cancel()

        if self.zk.state != KazooState.CONNECTED:
            return

        if self.party and self.party.participating:
            logger.info('Leaving party')
            self.party.leave()

        if self.lock and self.lock.is_acquired:
            logger.info('Releasing lock')
            self.lock.release()

    def event_listener(self, state):
        """Connection listener: reset master status when ZK is lost/suspended."""
        if state not in (KazooState.LOST, KazooState.SUSPENDED):
            return

        logger.info('ZK state transitioned to %s. Resetting master status.',
                    state)

        # Cancel pending lock acquisitions, which would otherwise break and
        # leave us in a bad state
        self.lock.cancel()

        # Force a fresh lock attempt on the next iteration once connected
        if self.lock.is_acquired:
            self.lock.is_acquired = False

        # Force a party re-join on the next iteration once connected
        if self.party.participating:
            self.party.participating = False

        # Until then we are not master
        self.is_master = None
class MainWindow(QMainWindow, ui_MainWindow.Ui_MainWindow):
    """Main window of the ZooKeeper browser: tree of nodes, tabs, log pane."""

    # Carries log text to the GUI through the signal/slot mechanism
    mainWriteGui = pyqtSignal(str)

    @catchExept
    def __init__(self):
        """Build the UI, wire up signals, configure logging, load hosts."""
        super().__init__()
        self.setupUi(self)
        self.zk = KazooClient()
        # Polls the client every 100 ms until it reports connected
        self.zkTimer = QTimer(self)
        self.zkTimer.setInterval(100)
        self.zkTimer.timeout.connect(self.zkTimeout)
        self.zkStartThread = threading.Thread(target=self.zkConnect)
        self.msgBox = QMessageBox(QMessageBox.NoIcon, "Connection",
                                  "Connecting...", QMessageBox.Cancel, self)

        self.treeWidget.itemClicked.connect(self.itemClicked)
        self.treeWidget.itemDoubleClicked.connect(self.itemOpen)
        self.tabWidget.tabCloseRequested.connect(self.closeTab)

        self.actionConnect.triggered.connect(self.msgBox.show)
        # Connect a slot that creates a fresh thread on every trigger.
        # Connecting Thread.start directly restarted an already finished
        # thread on the second connect, which raises RuntimeError.
        self.actionConnect.triggered.connect(self.zkStartConnect)
        self.actionConnect.triggered.connect(self.zkTimer.start)
        self.actionDisconnect.triggered.connect(self.zkDisconnect)
        self.actionACLVersion.triggered.connect(self.aclVersion)
        self.actionCreated.triggered.connect(self.created)
        self.actionChildrenCount.triggered.connect(self.childrenCount)
        self.actionDataLength.triggered.connect(self.dataLength)
        self.actionLastModified.triggered.connect(self.lastModified)
        self.actionLastModifiedTransactionId.triggered.connect(
            self.lastModifiedTransactionId)
        self.actionOwnerSessionId.triggered.connect(self.ownerSessionId)
        self.actionVersion.triggered.connect(self.version)
        self.actionCreationTransactionId.triggered.connect(
            self.creationTransactionId)
        self.actionChangeServerAddress.triggered.connect(
            self.changeServerAddress)

        # Cancelling the "Connecting..." box aborts the connection attempt
        self.msgBox.rejected.connect(self.zkTimer.stop)
        self.msgBox.rejected.connect(self.msgBox.hide)
        self.msgBox.rejected.connect(self.zkDisconnect)

        self.mainWriteGui.connect(self.slotMainWriteGui)
        self.log.setCenterOnScroll(True)
        self.dialog = SelectorDialog(self)

        class PlainTextWidgetHandler:
            """Minimal file-like object forwarding writes to a callable."""

            def __init__(self, logToWriteGui):
                self.logToWriteGui = logToWriteGui

            def write(self, text):
                self.logToWriteGui(text)

            def flush(self):
                pass

        # Log to both the GUI log widget and stderr
        logging.basicConfig(format='%(asctime)s.%(msecs)d: %(message)s',
                            datefmt='%H:%M:%S',
                            level=logging.DEBUG,
                            handlers=[
                                logging.StreamHandler(
                                    PlainTextWidgetHandler(
                                        self.logToWriteGui)),
                                logging.StreamHandler(sys.stderr)
                            ])

        self.treeWidget.setColumnCount(1)
        self.treeWidget.sortByColumn(0, Qt.AscendingOrder)

        # Progress bar with min == max == 0, added to the message box layout
        layout = self.msgBox.layout()
        progress = QProgressBar()
        progress.setMaximum(0)
        progress.setMinimum(0)
        layout.addWidget(progress, layout.rowCount() - 2, 1, 1,
                         layout.columnCount())

        self.actionConnect.setEnabled(False)
        if os.path.exists("config.txt"):
            with open("config.txt", "r") as f:
                stripped = [line.strip() for line in f]
            # Order-preserving de-duplication that also drops blank lines.
            # Unlike indexing the first line unconditionally, this copes
            # with an empty file.
            lines = list(dict.fromkeys(s for s in stripped if s))
            self.dialog.comboBox.addItems(lines)
            self.actionConnect.setEnabled(True)

    @pyqtSlot(str)
    def slotMainWriteGui(self, text):
        """Append *text* to the log widget."""
        self.log.ensureCursorVisible()
        self.log.textCursor().insertText(text)

    def logToWriteGui(self, text):
        """Forward *text* to the log widget via the ``mainWriteGui`` signal."""
        self.mainWriteGui.emit(text)

    @catchExept
    def getCurrentStat(self):
        """Return the stat of the node behind the tree's current item."""
        _, stat = self.zk.get(self.treeWidget.currentItem().text(1))
        return stat

    @catchExept
    @pyqtSlot()
    def aclVersion(self):
        self.print("ACL version: %s" % self.getCurrentStat().acl_version)

    @catchExept
    @pyqtSlot()
    def created(self):
        self.print("Created: %s" % self.getCurrentStat().created)

    @catchExept
    @pyqtSlot()
    def childrenCount(self):
        self.print("Children count: %s" % self.getCurrentStat().children_count)

    @catchExept
    @pyqtSlot()
    def dataLength(self):
        self.print("Data length: %s" % self.getCurrentStat().data_length)

    @catchExept
    @pyqtSlot()
    def lastModified(self):
        self.print("Last modified: %s" % self.getCurrentStat().last_modified)

    @catchExept
    @pyqtSlot()
    def lastModifiedTransactionId(self):
        self.print("Last modified transactionId: %s" %
                   self.getCurrentStat().last_modified_transaction_id)

    @catchExept
    @pyqtSlot()
    def ownerSessionId(self):
        self.print("Owner sessionId: %s" %
                   self.getCurrentStat().owner_session_id)

    @catchExept
    @pyqtSlot()
    def version(self):
        self.print("Version: %s" % self.getCurrentStat().version)

    @catchExept
    @pyqtSlot()
    def creationTransactionId(self):
        self.print("Creation transactionId: %s" %
                   self.getCurrentStat().creation_transaction_id)

    @catchExept
    @pyqtSlot()
    def changeServerAddress(self):
        """Let the user pick a host and persist the list to config.txt."""
        code = self.dialog.exec_()
        if code == QDialog.Accepted:
            text = self.currentHost()
            combo = self.dialog.comboBox
            with open("config.txt", "w") as f:
                # Every known host except the selected one ...
                hosts = [
                    combo.itemText(s) for s in range(combo.count())
                    if text != combo.itemText(s)
                ]
                combo.clear()
                combo.addItems(hosts)
                # ... with the selected one written first.
                hosts.insert(0, text)
                f.write('\n'.join(hosts))
            if text != "":
                if text != combo.itemText(0):
                    combo.insertItem(0, text)
                combo.setCurrentText(text)
            self.print("Current host changed to %s" % self.currentHost())
            self.actionConnect.setEnabled(True)

    @catchExept
    @pyqtSlot()
    def zkStartConnect(self):
        """Run ``zkConnect`` on a fresh background thread.

        A ``threading.Thread`` can be started only once, so a new one is
        created per connection attempt. The request is ignored while a
        previous attempt is still running.
        """
        if self.zkStartThread.is_alive():
            return
        self.zkStartThread = threading.Thread(target=self.zkConnect)
        self.zkStartThread.start()

    @catchExept
    @pyqtSlot()
    def zkDisconnect(self):
        """Stop the client and reset the UI to the disconnected state."""
        self.tabWidget.clear()
        self.treeWidget.clear()
        self.zk.stop()
        self.zk.close()
        self.actionDisconnect.setEnabled(False)
        self.menuFileInfo.setEnabled(False)
        self.actionConnect.setEnabled(True)
        self.actionChangeServerAddress.setEnabled(True)
        # No signal re-wiring here: zkStartConnect builds a new thread for
        # each attempt, so the Connect action is connected exactly once.

    @catchExept
    @pyqtSlot()
    def zkTimeout(self):
        """Timer tick: finish the connect sequence once the client is up."""
        if self.zk.connected:
            self.zkConnected()
            self.zkTimer.stop()

    @catchExept
    def zkConnected(self):
        """Switch the UI to the connected state and load the root nodes."""
        self.msgBox.hide()
        self.init()
        self.menuFileInfo.setEnabled(True)
        self.actionDisconnect.setEnabled(True)
        self.actionConnect.setEnabled(False)
        self.actionChangeServerAddress.setEnabled(False)

    @catchExept
    def zkConnect(self):
        """Point the client at the selected host and start it asynchronously."""
        self.zk.set_hosts(self.currentHost())
        self.zk.add_listener(self.my_listener)
        try:
            self.zk.start_async()
        except Exception as e:
            logging.exception("error: {0}".format(e))

    def currentHost(self):
        """Return the host currently selected in the dialog's combo box."""
        return self.dialog.comboBox.currentText()

    @catchExept
    def init(self):
        """Populate the tree with the children of the root node."""
        for child in self.zk.get_children("/"):
            # Item columns: display text, full path, node name
            self.treeWidget.addTopLevelItem(
                QTreeWidgetItem([child, "/" + child, child]))

    @catchExept
    def my_listener(self, state):
        """Connection listener: log every state transition."""
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            self.print("state is LOST!")
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            self.print("state is SUSPENDED!")
        else:
            # Handle being connected/reconnected to Zookeeper
            self.print("state is CONNECTED!")

    @catchExept
    def print(self, text):
        """Write *text* to the debug log."""
        logging.debug(text)

    @catchExept
    def printAllChildren(self, curPath, children, layer):
        """Recursively log *children* of *curPath*, indented by *layer*."""
        spaces = " " * layer
        for child in children:
            newPath = curPath + "/" + child
            data, stat = self.zk.get(newPath)
            self.print("%s: %s" % (spaces + child, data))
            self.printAllChildren(newPath, self.zk.get_children(newPath),
                                  layer + 1)

    @catchExept
    @pyqtSlot(int)
    def closeTab(self, idx):
        self.tabWidget.removeTab(idx)

    @catchExept
    @pyqtSlot(QTreeWidgetItem, int)
    def itemOpen(self, item, column):
        """Show the node's data in a read-only tab, reusing an open tab."""
        if not self.zk.exists(item.text(1)):
            return
        tabName = item.text(2)
        for i in range(self.tabWidget.count()):
            if tabName == self.tabWidget.tabText(i):
                self.tabWidget.setCurrentIndex(i)
                return
        innerText = QPlainTextEdit()
        innerText.setReadOnly(True)
        data, stat = self.zk.get(item.text(1))
        innerText.setPlainText(data.decode("utf8"))
        pos = self.tabWidget.addTab(innerText, tabName)
        self.tabWidget.setCurrentIndex(pos)

    @catchExept
    def drawAllTree(self):
        """Log the whole node tree starting at the root."""
        if self.zk.exists("/"):
            root = self.zk.get_children("/")
            self.printAllChildren("/", root, 0)
        else:
            self.print("Really?.. How?.. Why?..")

    @catchExept
    @pyqtSlot(QTreeWidgetItem, int)
    def itemClicked(self, item, column):
        """Refresh the clicked item's child count and child items."""
        item.setText(
            0,
            item.text(2) + " (%s)" % self.getCurrentStat().children_count)
        children = item.takeChildren()
        newChildren = []
        if self.zk.exists(item.text(1)):
            for child in self.zk.get_children(item.text(1)):
                # Reuse the existing tree item for a known child; the
                # for/else creates a new item only when none matched.
                for oldChild in children:
                    if oldChild.text(2) == child:
                        newChildren.append(oldChild)
                        break
                else:
                    newChildren.append(
                        QTreeWidgetItem(
                            [child, item.text(1) + "/" + child, child]))
        item.addChildren(newChildren)
class ScoutsDaemon(threading.Thread):
    """Daemon thread keeping one ``ServiceScout`` per conf node in ZooKeeper.

    The children of ``CONFS_PATH`` name the scouted services; each child's
    data is a JSON conf. Watches keep the scouts in sync with the confs.
    """

    def __init__(self, server, timeout):
        """Connect to ZooKeeper and start the thread.

        :param server: ZooKeeper hosts string.
        :param timeout: used both as the client timeout and as the time to
            wait for the initial connection.
        :raises ConnectException: if the connection cannot be established.
        """
        super(ScoutsDaemon, self).__init__()
        self.logger = base_logger.getChild(self.__class__.__name__)
        self._server = server
        self._timeout = timeout
        self._zk = None
        self._scouts = {}
        self.terminated = False
        self._event = threading.Event()
        self.daemon = True
        signal.signal(signal.SIGTERM, self._terminate)
        self._connect()
        self.start()

    def _connect(self):
        """Start the kazoo client unless one is already connected."""
        if self._zk and self._zk.connected:
            self.logger.info('[Connection] Kazoo client is already running')
            return

        self.logger.info(
            '[Connection] Starting Kazoo client (server="%s")', self._server)
        self._zk = KazooClient(hosts=self._server, timeout=self._timeout)
        self._zk.add_listener(self._conn_listener)
        event = self._zk.start_async()
        event.wait(timeout=self._timeout)

        if self._zk.connected:
            self.logger.info(
                '[Connection] Kazoo client successfully connected')
        else:
            self._zk.stop()
            self._event.set()
            raise ConnectException('Failed connecting to Zookeeper')

    def _conn_listener(self, state):
        """Connection listener: log every state change."""
        self.logger.info('[Connection] New state: %s', state)

    def _terminate(self, signum, frame):
        """SIGTERM handler: ask the run loop to stop."""
        if signum == signal.SIGTERM:
            self.logger.info('[General] Received SIGTERM, stopping...')
            self._event.set()

    def run(self):
        """Set up the scouts, wait for termination, then stop every scout."""
        self._zk.ensure_path(CONFS_PATH)
        self._setup_scouts()
        while not self._event.is_set():
            self._event.wait(1)
        # Iterate a snapshot: a watch callback may still touch the dict.
        for scout in list(self._scouts.values()):
            scout.stop()
        self.terminated = True
        self.logger.info('[General] Shutting down...')

    def _setup_scouts(self, event=None):
        """Sync the scouts with the children of ``CONFS_PATH``.

        Also used as the children watch callback, hence the unused *event*.
        """
        services = self._zk.get_children(CONFS_PATH, watch=self._setup_scouts)
        self.logger.info('[Scouts] Found confs for: %s', services)

        # Iterate a copy of the keys: entries are removed in the loop, and
        # mutating a dict while iterating it raises RuntimeError.
        for scouted_service in list(self._scouts):
            if scouted_service not in services:
                self.logger.info(
                    '[Scouts] Service "%s" not in confs, removing its scout',
                    scouted_service)
                self._scouts[scouted_service].stop()
                self._scouts.pop(scouted_service)

        for service in services:
            self._setup_scout(service)

    def _setup_scout(self, service, event=None):
        """Create the scout for *service*, or push a new conf to it.

        Re-arms a data watch on the service's conf node on every call.
        """
        data, stat = self._zk.get(
            "%s/%s" % (CONFS_PATH, service),
            watch=lambda ev: self._setup_scout(service, ev))
        conf = json.loads(data)

        if service in self._scouts:
            self.logger.info('[Scouts] New conf for %s', service)
            self._scouts[service].set_conf(conf)
        else:
            self.logger.info('[Scouts] Creating a scout for %s', service)
            scout = ServiceScout(zk=self._zk,
                                 service=service,
                                 cmd=conf['cmd'],
                                 zk_path=conf['zk_path'],
                                 refresh=conf['refresh'])
            self._scouts[service] = scout
# @Time : 2018/11/19 21:37
# @describe : Asynchronous usage of kazoo. The callback and CRUD examples
#             from the official documentation have problems when used
#             asynchronously; to be studied further.
import sys
from kazoo.client import KazooClient
from kazoo.handlers.gevent import SequentialGeventHandler
# from kazoo.handlers.eventlet import SequentialEventletHandler
from kazoo.exceptions import ConnectionLossException
from kazoo.exceptions import NoAuthException

# Open the connection. Kazoo does not rely on gevent monkey patching, so the
# appropriate handler has to be passed in explicitly (SequentialGeventHandler()
# here); the same applies to eventlet.
zk = KazooClient(hosts='39.107.88.145:2181',
                 timeout=1,
                 handler=SequentialGeventHandler())
event = zk.start_async()

# wait() blocks on the event object returned by start_async()
event.wait(timeout=1)

# The connection may never succeed, so check the state and bail out.
if not zk.connected:
    zk.stop()
    raise Exception("Unable to connect")


def my_callback(async_obj):
    """Completion callback: fetch the result and process the children."""
    try:
        print('-------------------------')
        children = async_obj.get()
        do_something(children)
    except (ConnectionLossException, NoAuthException):
        sys.exit(1)
class ZookeeperServiceRegistry(BaseServiceRegistry):
    """Service registry stored under ``<chroot>/services`` in ZooKeeper.

    Each registered container is an ephemeral node at
    ``<chroot>/services/<service_type>/<identity>`` holding a JSON document.
    """

    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        """Create (but do not start) the gevent-based kazoo client."""
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        # Reference count of on_start()/on_stop() calls
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        """Build a registry from a mapping with optional hosts/chroot keys."""
        return cls(
            hosts=config.get('hosts', DEFAULT_HOSTS),
            chroot=config.get('chroot', DEFAULT_CHROOT),
            **kwargs
        )

    def on_start(self, timeout=10):
        """Start the client on the first call; later calls only count.

        :raises RuntimeError: if not connected after *timeout* seconds.
        """
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        """Stop the client once the start count drops back to zero."""
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        """Children watch: re-run the lookup when the member list changes."""
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        """Node watch: drop an identity from *service* when its node is deleted."""
        try:
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        """Read one member node (setting a watch) and return it as a str dict."""
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(
            path, watch=functools.partial(self.on_service_watch, service))
        value, znode = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        """Return the names of all service types under ``<chroot>/services``."""
        result = self.client.get_children_async(
            path='%s/services' % self.chroot,
        )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        """Sync *service* with the members registered for its service type.

        Known members are updated from their node data; identities no
        longer present are removed. The *watch* argument is accepted but
        not consulted; watches are always set.

        :raises LookupFailure: if the service type node does not exist.
        """
        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None,
                                "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)

        # Identities still in this set after the loop have disappeared.
        identities = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            identities.discard(identity)
        for identity in identities:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        """Create the ephemeral node advertising *container*."""
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(
            path,
            value.encode('utf-8'),
            ephemeral=True, makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        """Delete the node advertising *container* and wait for completion."""
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        # The pending result must not be given an exception up front: doing
        # so made every unregister fail regardless of what the server
        # answered. Errors from the delete itself surface through get().
        result.get(timeout=timeout)
class Coordinator(object):
    """Master/slave coordination of sender nodes through ZooKeeper.

    Mastership is decided by the lock at ``/iris/sender_master``; the set of
    participating senders is tracked by the party at ``/iris/sender_nodes``.
    """

    def __init__(self, zk_hosts, hostname, port, join_cluster):
        """Connect to ZooKeeper and, if *join_cluster*, join the party.

        When not joining the cluster the client is opened read-only.
        """
        self.me = '%s:%s' % (hostname, port)
        self.is_master = None
        self.slaves = cycle([])
        self.slave_count = 0
        self.started_shutdown = False

        read_only = not join_cluster

        self.zk = KazooClient(hosts=zk_hosts,
                              handler=SequentialGeventHandler(),
                              read_only=read_only)
        event = self.zk.start_async()
        event.wait(timeout=5)

        self.lock = self.zk.Lock(path='/iris/sender_master',
                                 identifier=self.me)

        # Used to keep track of slaves / senders present in cluster
        self.party = Party(client=self.zk,
                           path='/iris/sender_nodes',
                           identifier=self.me)

        if join_cluster:
            self.party.join()

    def am_i_master(self):
        """Return the cached master flag (True, False or None)."""
        return self.is_master

    # Used for API to get the current master
    def get_current_master(self):
        """Return ``(host, port)`` of the first lock contender, or None."""
        try:
            contenders = self.lock.contenders()
        except kazoo.exceptions.KazooException:
            logger.exception('Failed getting contenders')
            return None

        if contenders:
            return self.address_to_tuple(contenders[0])
        return None

    # Used for API to get the current slaves if master can't be reached
    def get_current_slaves(self):
        """Return address tuples of all party members."""
        return [self.address_to_tuple(host) for host in self.party]

    def address_to_tuple(self, address):
        """Split ``'host:port'`` into ``(host, int(port))``; None if malformed."""
        try:
            host, port = address.split(':')
            return host, int(port)
        except (IndexError, ValueError):
            logger.error('Failed getting address tuple from %s', address)
            return None

    def update_status(self):
        """Refresh the master flag and the slave list; no-op during shutdown."""
        if self.started_shutdown:
            return

        if self.zk.state == KazooState.CONNECTED:
            if self.is_master:
                # Already master: just confirm the lock is still held
                self.is_master = self.lock.is_acquired
            else:
                try:
                    self.is_master = self.lock.acquire(blocking=False,
                                                       timeout=2)
                except kazoo.exceptions.LockTimeout:
                    self.is_master = False
                    logger.exception(
                        'Failed trying to acquire lock (shouldn\'t happen '
                        'as we\'re using nonblocking locks)')
                except kazoo.exceptions.KazooException:
                    self.is_master = False
                    logger.exception(
                        'ZK problem while Failed trying to acquire lock')
        else:
            logger.error('ZK connection is not in connected state')
            self.is_master = False

        if self.is_master:
            slaves = [
                self.address_to_tuple(host) for host in self.party
                if host != self.me
            ]
            self.slave_count = len(slaves)
            self.slaves = cycle(slaves)
        else:
            self.slaves = cycle([])
            self.slave_count = 0

    def update_forever(self):
        """Poll ``update_status`` every UPDATE_FREQUENCY until shutdown."""
        while True:
            if self.started_shutdown:
                return

            old_status = self.is_master
            self.update_status()
            new_status = self.is_master

            # A role change is logged at info level, the steady state at debug
            if old_status != new_status:
                log = logger.info
            else:
                log = logger.debug

            if self.is_master:
                log('I am the master sender')
            else:
                log('I am a slave sender')

            metrics.set('slave_instance_count', self.slave_count)
            # is_master is still None when update_status() returned early
            # (shutdown began after the check above); int(None) would raise
            # TypeError, so coerce through bool first.
            metrics.set('is_master_sender', int(bool(self.is_master)))

            sleep(UPDATE_FREQUENCY)

    def leave_cluster(self):
        """Mark shutdown, leave the party and release the lock if held."""
        self.started_shutdown = True

        if self.party and self.party.participating:
            logger.info('Leaving party')
            self.party.leave()

        if self.lock and self.lock.is_acquired:
            logger.info('Releasing lock')
            self.lock.release()
class Zookeeper:
    """Coroutine-style wrapper around a gevent-based ``KazooClient``.

    Each coroutine issues the matching kazoo ``*_async`` call and returns the
    value of its result; exceptions raised by kazoo propagate unchanged.
    """

    def __init__(self, hosts):
        """Start the client and wait up to 30 seconds for the connection.

        If the client is still not connected it is stopped; no exception is
        raised for that case.
        """
        self.zk = KazooClient(hosts=hosts,
                              handler=SequentialGeventHandler(),
                              logger=logger)
        # start_async() returns immediately
        event = self.zk.start_async()
        # Wait for 30 seconds and see if we're connected
        event.wait(timeout=30)
        if not self.zk.connected:
            # Not connected, stop trying to connect
            self.zk.stop()

    @coroutine
    def get_children(self, node):
        """Return the children of *node*."""
        children = self.zk.get_children_async(node)
        raise Return(children.get())

    @coroutine
    def get_node(self, node):
        """Return the result of ``get`` on *node*."""
        data = self.zk.get_async(node)
        raise Return(data.get())

    @coroutine
    def check_path_exist(self, path):
        """Return True if *path* exists, False otherwise."""
        raise Return(bool(self.zk.exists(path)))

    @coroutine
    def create_path(self, path):
        """Ensure *path* exists, creating missing nodes."""
        result = self.zk.ensure_path_async(path)
        raise Return(result.get())

    @coroutine
    def create_node(self, path, value):
        """Create an ephemeral node at *path* holding *value*."""
        result = self.zk.create_async(path=path, value=value, acl=None,
                                      ephemeral=True)
        raise Return(result.get())

    @coroutine
    def update_node(self, path, value, version=-1):
        """Set the data of *path* to *value* for the given *version*."""
        result = self.zk.set_async(path, value, version)
        raise Return(result.get())

    @coroutine
    def del_node(self, node):
        """Delete *node*."""
        node_info = self.zk.delete_async(node)
        raise Return(node_info.get())

    def close(self):
        """Stop the underlying client."""
        self.zk.stop()
class ZooHandler(object):
    """Keeps an ephemeral registration node for this process in ZooKeeper.

    Configuration is read from ``settings.ZOOKEEPER_SETTING`` (keys
    ``enable``, ``server_address``, ``base_dir``, ``local_name``,
    ``local_ip``) and ``settings.HTTP_SERVER_SETTING['port']``.  When the
    feature is disabled ``zookeeper_client`` stays ``None`` and every method
    becomes a no-op.
    """

    def __init__(self):
        """Start the kazoo client asynchronously if ZooKeeper is enabled."""
        self.zookeeper_client = None

        if not settings.ZOOKEEPER_SETTING['enable']:
            logging.info('zookeeper disabled')
            return

        self.zoo_hosts = settings.ZOOKEEPER_SETTING['server_address']
        logging.info('start zookeeper client, zoo hosts: %s' % self.zoo_hosts)
        self.base_dir = settings.ZOOKEEPER_SETTING['base_dir']

        self.zookeeper_client = KazooClient(hosts=self.zoo_hosts)
        # Registration happens from the listener once the session is up.
        self.zookeeper_client.add_listener(self.state_listener)
        self.zookeeper_client.start_async()

    def state_listener(self, state):
        """React to kazoo connection state changes.

        :param state: one of the ``KazooState`` values.
        """
        if state == KazooState.LOST:
            # Session was lost.
            logging.error('zookeeper lost!')
        elif state == KazooState.SUSPENDED:
            # Disconnected from ZooKeeper.
            logging.error('zookeeper disconnected!')
        elif state == KazooState.CONNECTED:
            # Connected / reconnected to ZooKeeper.
            self.register_node()
            logging.warning('zookeeper reconnected! try to register')
        else:
            # The original logged this message twice (error + critical);
            # one record at the higher level carries the same information.
            logging.critical('unexpected zookeeper state!!!')

    def register_node(self):
        """Create the ephemeral registration node and watch it.

        The node path comes from :meth:`get_register_path`.  Existence is
        checked asynchronously; the node is created only when the check
        succeeded and reported that the node is absent.  A watch re-creates
        the node when it is deleted and re-arms itself after every event.
        Only asynchronous kazoo calls are used here.
        """
        if not self.zookeeper_client or not self.zookeeper_client.connected:
            logging.error('zoo not connected, register cancel')
            return

        path = ZooHandler.get_register_path()

        # Try to register the node once the existence check completes.
        def try_to_create_node(result):
            logging.info('zoo try_to_create_noe called')
            try:
                # The failure case must be tested first: a failed async
                # result also has ``value`` of None, which would otherwise
                # be mistaken for "node does not exist".
                if result.exception:
                    logging.critical(
                        'critical error when try to check node when reconnected, %s',
                        result.exception)
                elif result.value is None:
                    # None means the node does not exist.
                    self.zookeeper_client.create_async(
                        path, makepath=True, ephemeral=True)
                else:
                    logging.warning(
                        'node already exists when reconnect and try to register')
            except Exception as e:
                # ``e.message`` does not exist on Python 3 exceptions.
                logging.exception('critical error, %s', e)

        # Monitor changes of the registration node.
        def node_watcher(watch_event):
            logging.info('zoo node_watcher called')
            try:
                event_type = watch_event.type
                if event_type == EventType.DELETED:
                    logging.warning('zoo nodes deleted, try recreate')
                    self.zookeeper_client.create_async(
                        path, makepath=True, ephemeral=True)
                elif event_type == EventType.CHANGED:
                    logging.warning('zoo nodes changed,do nothing')
                elif event_type == EventType.CHILD:
                    logging.warning('zoo nodes childed,do nothing')
                elif event_type == EventType.CREATED:
                    logging.info('zoo nodes success created')
                elif event_type == EventType.NONE:
                    logging.error('zoo nodes status return None')
            finally:
                # ZooKeeper watches fire once; re-arm for the next event.
                self.zookeeper_client.exists_async(path, watch=node_watcher)

        try:
            future = self.zookeeper_client.exists_async(
                path, watch=node_watcher)
            future.rawlink(try_to_create_node)
        except ZookeeperError as e:
            logging.exception(
                'zookeeper exception when register node: %s' % e)
        except Exception:
            logging.exception('critical error!')

    def stop(self):
        """Detach the state listener and stop the client, if one exists."""
        logging.info('stopping zookeeper client')
        if self.zookeeper_client:
            self.zookeeper_client.remove_listener(self.state_listener)
            self.zookeeper_client.stop()
        logging.info('zookeeper stopped')

    @staticmethod
    def get_register_path():
        """Return ``<base_dir>/<local_name>:<local_ip>:<port>``.

        A single trailing ``/`` on ``base_dir`` is dropped.  An empty
        ``base_dir`` is accepted (the original indexed ``base_dir[-1]`` and
        raised ``IndexError``).
        """
        base_dir = settings.ZOOKEEPER_SETTING['base_dir']
        if base_dir.endswith('/'):
            base_dir = base_dir[:-1]
        return "%s/%s:%s:%s" % (
            base_dir,
            settings.ZOOKEEPER_SETTING['local_name'],
            settings.ZOOKEEPER_SETTING['local_ip'],
            settings.HTTP_SERVER_SETTING['port'])
class Pool(object):
    """
    A pool represents a set of resources and nodes that own/manage those
    resources. The pool class is responsible for tracking state of all nodes
    and resources within the entire pool.

    ZooKeeper layout used by this class (rooted at ``/carousel/<name>``):
    ``resources/<resource>``, ``nodes/<node>`` and ``leaders/<resource>``.
    """

    def __init__(self, name, hosts='127.0.0.1:2181'):
        """
        :param name: pool name; becomes the last path component.
        :param hosts: ZooKeeper connection string.  When falsy, no connection
            is made and ``self.zk`` stays ``None`` until :meth:`connect`.
        """
        self.name = name
        self.path = '/carousel/{}'.format(name)
        self.hosts = hosts

        # Generic metadata tracked for the entire pool
        self.nodes = set()
        self.resources = set()

        self.zk = None
        if hosts:
            self.connect(hosts)

    def _on_resources_change(self, res):
        """Children-watch callback: replace the known resource names."""
        self.resources = set(res)

    def _on_nodes_change(self, res):
        """Children-watch callback: replace the known node names."""
        self.nodes = set(res)

    @property
    def healthy(self):
        """``True`` when every known resource has an entry under ``leaders``."""
        resources_with_leaders = set(
            self.zk.get_children(os.path.join(self.path, 'leaders')))
        return not (self.resources - resources_with_leaders)

    def create(self, metadata=None):
        """Create the pool znodes, then :meth:`load` the pool.

        :param metadata: JSON-serialisable pool metadata; defaults to ``{}``.
            (``None`` replaces the former shared mutable ``{}`` default.)
        """
        if metadata is None:
            metadata = {}

        # Create the base pool path with metadata
        self.zk.create(self.path, json.dumps(metadata).encode('utf-8'),
                       makepath=True)

        for child in ('resources', 'nodes', 'leaders'):
            self.zk.create(os.path.join(self.path, child))

        self.load()

    def load(self):
        """Read pool metadata and install the resource/node watches.

        :raises PoolException: when the pool path does not exist.
        """
        # Check whether the pool exists
        if not self.zk.exists(self.path):
            raise PoolException("Pool with name {} does not exist!".format(
                self.name))

        # Next load the pool meta-data
        raw_meta, self.meta_stat = self.zk.get(self.path)
        self.meta = json.loads(raw_meta.decode())

        # Finally, we need to keep track of resources and nodes
        ChildrenWatch(self.zk, os.path.join(self.path, 'resources'),
                      self._on_resources_change)
        ChildrenWatch(self.zk, os.path.join(self.path, 'nodes'),
                      self._on_nodes_change)

    def connect(self, hosts, timeout=4):
        """Connect to ZooKeeper and load the pool, creating it if missing.

        :param hosts: ZooKeeper connection string.
        :param timeout: value forwarded to ``KazooClient(timeout=...)``.
        :raises Exception: when no connection is established within the
            5 second wait.
        """
        self.zk = KazooClient(hosts, timeout=timeout,
                              handler=SequentialGeventHandler())
        self.zk.start_async().wait(timeout=5)

        if not self.zk.connected:
            self.zk.stop()
            raise Exception('Failed to reach zookeeper')

        try:
            self.load()
        except PoolException:
            self.create()

    def disconnect(self):
        """Stop the kazoo client."""
        self.zk.stop()

    def ensure_resources(self, *resources):
        """Call :meth:`ensure_resource` for every given name."""
        for resource in resources:
            self.ensure_resource(resource)

    def ensure_resource(self, name, metadata=None):
        """Create ``resources/<name>`` unless it already exists.

        The payload is encoded to bytes, matching what :meth:`create` does
        for the pool node (the original passed a text string here).
        """
        payload = json.dumps(metadata or {}).encode('utf-8')
        try:
            self.zk.create(os.path.join(self.path, 'resources', name),
                           payload)
        except NodeExistsError:
            pass

    def delete_resource(self, name):
        """Delete a resource node and, if present, its leader node."""
        assert name in self.resources
        self.zk.delete(os.path.join(self.path, 'resources', name))

        try:
            self.zk.delete(os.path.join(self.path, 'leaders', name))
        except NoNodeError:
            # No leader was registered for this resource.
            pass

    def create_node(self, metadata=None):
        """Return a new ``Node`` bound to this pool."""
        return Node(self, metadata or {})

    def get_leader(self, resource):
        """Return the raw data stored at ``leaders/<resource>``."""
        result, _ = self.zk.get(os.path.join(self.path, 'leaders', resource))
        return result
class kazooMaster(object):
    """Convenience wrapper around a ``KazooClient`` for one device/user path.

    For ``type_`` ``"e"``/``"E"`` the working path is ``/<node>``; otherwise
    it is ``/<userID>/<pid>/<node>`` and ``path_rev`` holds the reverse
    mapping ``/<node>/<userID>/<pid>``.
    """

    def __init__(self, ip, type_="p", node="", userID="", pid="",
                 operation="", remap=False):
        """Store the identifiers, derive the paths and connect (blocking).

        :param ip: ZooKeeper host; port 2181 is appended.
        :param type_: ``"e"``/``"E"`` selects the short ``/<node>`` path.
        :param node: device/node identifier.
        :param userID: user identifier.
        :param pid: product identifier.
        :param operation: stored as-is; not used inside this class.
        :param remap: enables :meth:`reMap`.
        """
        self.ip = ip
        self.node = node
        self.type = type_
        self.userID = userID
        self.productID = pid
        self.operation = operation
        self.path_rev = ""
        if type_ in ("e", "E"):
            self.path = "/" + self.node
        else:
            self.path_rev = "/" + self.node + "/" + self.userID + "/" + self.productID
            self.path = "/" + self.userID + "/" + self.productID + "/" + self.node
        self.version = ""
        self.remap = remap
        self.start_client()

    def start_client(self):
        """Create the client and connect synchronously."""
        self.zk = KazooClient(hosts='{}:2181'.format(self.ip), read_only=False)
        self.zk.start()

    def start_client_async(self):
        """Create the client and start connecting without waiting."""
        self.zk = KazooClient(hosts='{}:2181'.format(self.ip), read_only=False)
        self.zk.start_async()

    def children_watch(self, path=None):
        """Print the children of ``path`` (default ``self.path``) on change."""
        if path is None:
            path = self.path
        print(path)

        # The watch function is called immediately, and from then on
        # whenever the children change.
        @self.zk.ChildrenWatch(path)
        def watch_children(children):
            print("Children are now: %s" % children)

    def stop_client(self):
        """Stop the kazoo client."""
        self.zk.stop()

    def get_children(self, path):
        """Return the children of ``path``, or ``""`` when it is missing."""
        print(path, "in get children")
        if self.zk.exists(path) is None:
            return ""
        return self.zk.get_children(path)

    def exist(self, path):
        """Return ``True`` when ``path`` exists."""
        return self.zk.exists(path) is not None

    def create(self, path, param="p"):
        """Create ``path`` with value ``b"0"`` if it does not exist yet.

        :param param: ``"e"`` creates an ephemeral node; any other value
            creates a persistent one.
        :return: ``True`` when the node was created, ``False`` otherwise.
        """
        ephemeral = param == "e"
        logging.basicConfig(filename='logs/connection.log', filemode='w',
                            level=logging.DEBUG)
        # NOTE(review): this guards on self.path, not on the ``path``
        # argument -- kept as in the original; confirm which was intended.
        if self.path == "":
            logging.error("PATH EMPTY")
            return False
        if self.zk.exists(path) is None:
            self.zk.create(path, value=b"0", makepath=True,
                           ephemeral=ephemeral)
            return True
        return False

    def stat(self):
        """Block until the data watch on ``self.path`` has counted down.

        The watch callback decrements a counter that starts at 4 on every
        invocation; this method returns once the counter reaches zero, then
        stops the client.  Returns ``-1`` early when the second watch
        registration yields a falsy object.

        NOTE(review): the callback is registered twice (decorator and the
        explicit ``DataWatch`` call), as in the original -- confirm intent.
        """
        import time  # local: only needed to yield while waiting

        stop = 4

        @self.zk.DataWatch("{}".format(self.path))
        def my_func(data, stat):
            nonlocal stop
            stop = stop - 1
            if stat is None:
                return
            print("changed")
            print("Data is {} ".format(data))
            print("Version is {} ".format(stat.version))

        if self.zk.exists(self.path) is not None:
            val = DataWatch(self.zk, self.path, func=my_func)
            if not val:
                return -1
            while stop > 0:
                # Sleep instead of spinning so the wait does not burn a core.
                time.sleep(0.05)
        else:
            print("PATH INVALID")
        self.zk.stop()

    def delete(self, node_name):
        """Recursively delete ``node_name`` and stop the client.

        :raises Exception: when the node does not exist.

        Errors raised by the delete call itself are logged and swallowed,
        as in the original.
        """
        if self.zk.exists(node_name):
            # The original passed one argument for two placeholders and
            # raised IndexError here.
            print("Node {} in ip {} exists".format(node_name, self.ip))
        else:
            print("Node {} in ip {} does not exists".format(node_name, self.ip))
            raise Exception("Node does not exists")
        try:
            self.zk.delete(node_name, recursive=True)
        except Exception:
            logging.info("Error whle updating Node " + node_name)
        self.zk.stop()

    def retrieve(self, custom_path=None):
        """Return the znode version of ``custom_path`` as a string.

        :param custom_path: path to read; defaults to ``self.path``.
        :return: the version string, or ``-1`` when the path is missing.
        """
        if not custom_path:
            custom_path = self.path
        # Check the path that is actually read (the original always checked
        # self.path, even when a custom path was supplied).
        if self.zk.exists(custom_path) is None:
            return -1
        _, znode_stat = self.zk.get(custom_path)
        return str(znode_stat.version)

    # Give only user ID for finding mapping
    def getmap(self):
        """Return ``[{"key", "device", "version"}, ...]`` for ``/<userID>``."""
        parent = "/" + self.userID
        if not self.zk.exists(parent):
            return []
        to_return = []
        for key in self.zk.get_children(parent):
            key_path = parent + "/" + key
            if not self.zk.exists(key_path):
                continue
            for device in self.zk.get_children(key_path):
                to_return.append({
                    "key": key,
                    "device": device,
                    "version": self.retrieve(key_path + "/" + device),
                })
        return to_return

    # Give only Device Id for remapping
    def reMap(self):
        """Return ``(user, item)`` pairs found under ``/<node>``.

        Returns an empty list (after printing a notice) unless the instance
        was created with ``remap=True``.
        """
        remap_data = []
        if self.remap == True:  # noqa: E712 -- keep the original strict check
            dev_down = "/" + self.node
            for user in self.zk.get_children(dev_down):
                for item in self.zk.get_children(dev_down + "/" + user):
                    remap_data.append((user, item))
                    # TODO: delete user key pairs here
                    print(user, item)
        else:
            print("REMAP PARAMETER FALSE")
        return remap_data

    def setVersion(self, path, value):
        """Write ``str(value)`` to ``path``; errors are printed, not raised."""
        if self.zk.exists(path) is None:
            print("Path does not exists setversion method")
        else:
            try:
                self.zk.set(path=path, value=str(value).encode())
            except Exception as e:
                print(e, "Exception in set versioning")
class ZookeeperServiceRegistry(BaseServiceRegistry):
    """Service registry backed by ZooKeeper.

    Services are stored as ephemeral znodes at
    ``<chroot>/services/<service_type>/<identity>`` whose data is a JSON
    object with ``endpoint``, ``identity`` and ``log_endpoint``.
    """

    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        """
        :param hosts: ZooKeeper connection string.
        :param chroot: path prefix under which ``services`` lives.
        """
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        # Reference count of on_start()/on_stop() calls.
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        """Build a registry from a mapping with optional ``hosts``/``chroot``."""
        return cls(hosts=config.get('hosts', DEFAULT_HOSTS),
                   chroot=config.get('chroot', DEFAULT_CHROOT),
                   **kwargs)

    def on_start(self, timeout=10):
        """Connect on the first call; later calls only bump the counter.

        :raises RuntimeError: when not connected after ``timeout`` seconds.
        """
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        """Stop the client once the start counter drops back to zero."""
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        """Log kazoo connection state transitions."""
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        """Watch callback: redo the lookup when the set of instances changes."""
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        """Watch callback: drop an instance when its znode is deleted."""
        try:
            # A path with fewer than three components raises ValueError,
            # which is logged below.
            _prefix, _service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        """Read one instance znode, set a watch, return its data as a dict.

        Keys and values of the stored JSON object are converted with ``str``.
        """
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(
            path, watch=functools.partial(self.on_service_watch, service))
        value, _znode = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        """Return the list of registered service type names."""
        result = self.client.get_children_async(
            path='%s/services' % self.chroot,
        )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        """Synchronise ``service`` with the instances stored in ZooKeeper.

        Every instance found is passed to ``service.update``; identities the
        service knew before but that are no longer present are removed.

        :param container: not used by this implementation.
        :param watch: not used by this implementation; a watch is always set.
        :param timeout: seconds to wait for the children listing.
        :raises LookupFailure: when the service type node does not exist.
        :return: the ``service`` object that was passed in.
        """
        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None,
                                "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)

        # Whatever is left in ``stale`` after the loop has disappeared.
        stale = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            stale.discard(identity)
        for identity in stale:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        """Return ``<chroot>/services/<service_type>/<identity>``."""
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        """Create the ephemeral znode describing ``container``.

        Errors from the create call (or the wait) propagate unchanged.
        """
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(
            path,
            value.encode('utf-8'),
            ephemeral=True, makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        """Delete the znode describing ``container``.

        :raises RegistrationFailure: when the delete fails.

        The original called ``set_exception`` on the pending result before
        waiting on it, which marks the result as failed regardless of what
        ZooKeeper answers.  The failure is now raised only when the
        operation actually fails.
        """
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        try:
            result.get(timeout=timeout)
        except Exception:
            raise RegistrationFailure()
class ZookeeperLocks(object):
    """
    Zookeeper lock class that finds and removes locks from a Hive object

    Adapted from here:
    https://etl.svbtle.com/removing-database-level-locks-in-hive
    """

    def __init__(self, hosts, database, table):
        """
        :param hosts: ZooKeeper connection string.
        :param database: Hive database name, used by :meth:`path_setup`.
        :param table: Hive table name (stored; not used inside this class).
        :raises Exception: when the connection cannot be established.
        """
        self.zk_client = KazooClient(hosts=hosts)
        self.database = database
        self.table = table
        # TODO: Move this out of the init function
        # Currently, this only looks at the table level and
        # assumes that the locks will be under there -
        # partitions tables will have nested dirs for the
        # partition (eg. domain=x, table=y)
        self.startup()

    def startup(self):
        """Connect to ZooKeeper, waiting at most 20 seconds.

        :raises Exception: when still not connected after the wait.
        """
        # https://kazoo.readthedocs.io/en/latest/async_usage.html
        # start_async() returns immediately
        event = self.zk_client.start_async()

        # Wait for 20 seconds and see if we're connected
        event.wait(timeout=20)

        if not self.zk_client.connected:
            # Not connected, stop trying to connect
            self.zk_client.stop()
            raise Exception("Unable to connect.")

    def shutdown(self):
        """
        Close the connection to Zookeeper
        """
        self.zk_client.stop()

    def show_all_children(self):
        """
        Get a list of child nodes of the root path.
        :return: List of child node names
        """
        return self.zk_client.get_children('/')

    def get_hive_namespace(self):
        """
        Find the Hive Zookeeper name space among the root children.
        :return: the matched text of the first child matching
            ``hive_zookeeper_namespace_hive.?``, or None when none matches
        """
        for child in self.show_all_children():
            match = re.search(r'hive_zookeeper_namespace_hive.?', child)
            if match:
                return match.group()
        return None

    def path_setup(self):
        """Return ``/<hive namespace>/<database>``."""
        return "/{0}/{1}".format(self.get_hive_namespace(), self.database)

    def my_rec(self, name_of_lock):
        """
        Recursively goes through and remove locks at all levels.
        Locks can be at the table or at a partition level.
        In the case of some tables, we are 2 partitions deep.
        It is also possible to have more than 1 lock
        on a given partition.

        :param name_of_lock: znode path to inspect.
        :return: ``"Deleted <path>"`` when the path itself was a lock that
            got deleted, ``"All done"`` when a non-lock path has no
            children, otherwise a list with the results of the recursive
            calls (empty when deleting the lock hit ``NoNodeError``).
        """
        locks_deleted = []

        # Lock nodes contain "LOCK-" in their path.  Plain text membership
        # works on both Python 2 and 3 (the original encoded to bytes first,
        # which breaks the comparison on Python 3).
        if 'LOCK-' in name_of_lock:
            try:
                self.zk_client.delete(name_of_lock)
                print('Deleted lock: {0}'.format(name_of_lock))
                return "Deleted " + name_of_lock
            except NoNodeError:
                print("No node error - delete lock: {0}".format(name_of_lock))
        else:
            new_locks = ""
            print("Need to go deeper {0}".format(name_of_lock))
            try:
                # this will return a list, but finish
                # if the list is empty
                new_locks = self.zk_client.get_children(name_of_lock + "/")
            except NoNodeError:
                print("No node error - get_children: {0}".format(name_of_lock))

            if len(new_locks) > 0:
                for a_lock in new_locks:
                    # for the next depth, go through
                    # and remove anything that contains "LOCK-"
                    deleted_lock = self.my_rec(name_of_lock + "/" + a_lock)
                    locks_deleted.append(deleted_lock)
            else:
                return "All done"

        return locks_deleted
class ServiceWatcher:
    """Tracks a configuration node and a data-service endpoint in ZooKeeper.

    The configuration is read from ``CONF_NODE`` and the endpoint is chosen
    from the children of ``DATA_PARENT_NODE``; both are re-read through
    watches that call back into this object.
    """

    def __init__(self, hosts, timeout):
        """Connect and, on success, load configuration and endpoint.

        :param hosts: ZooKeeper connection string.
        :param timeout: seconds to wait for the connection.

        When the connection is not established the client is stopped and an
        error is logged; ``endpoint`` and ``conf`` then stay ``None``.
        """
        self._logger = logging.getLogger(self.__class__.__name__)
        self._endpoint = None
        self._conf = None
        self._zk = KazooClient(hosts=hosts)

        connected_event = self._zk.start_async()
        connected_event.wait(timeout=timeout)

        if not self._zk.connected:
            self._zk.stop()
            self._logger.error('Kazoo client failed to connect')
            return

        self._logger.info('Kazoo client successfully connected')
        self._init_conf_node()
        self._setup_conf()
        self._setup_data_endpoint()

    def _init_conf_node(self):
        """Create ``CONF_NODE`` with default settings if it is missing."""
        payload = json.dumps(
            {'last_n': 100, 'repeat_seconds': 30}).encode('utf-8')
        try:
            self._zk.create(CONF_NODE, payload)
        except NodeExistsError:
            # A configuration is already present; leave it alone.
            return
        self._logger.warning(
            'No configuration found at path %s, setting default %s'
            % (CONF_NODE, payload))

    def _setup_data_endpoint(self, event=None):
        """Pick an endpoint from ``DATA_PARENT_NODE`` and re-arm the watch.

        The current endpoint is kept while its host part is still listed;
        otherwise the first listed child becomes the new endpoint.
        """
        self._zk.ensure_path(DATA_PARENT_NODE)
        available = self._zk.get_children(DATA_PARENT_NODE,
                                          watch=self._setup_data_endpoint)
        if len(available) == 0:
            self._logger.error('No available endpoints found')
            return

        current = self._endpoint
        if current and current.split(':')[0] in available:
            # The endpoint in use is still registered.
            return

        self._logger.info('Found %s data service endpoints: %s'
                          % (len(available), available))
        self._set_endpoint(available[0])

    def _set_endpoint(self, endpoint, event=None):
        """Read the endpoint's znode and store ``<endpoint>:<port>``."""
        node_path = '%s/%s' % (DATA_PARENT_NODE, endpoint)
        raw, _stat = self._zk.get(node_path)
        info = json.loads(raw.decode('utf-8'))
        self._endpoint = '%s:%s' % (endpoint, info['port'])
        self._logger.info('Endpoint set to %s, which is running since %s'
                          % (self._endpoint, info['started']))

    @property
    def endpoint(self):
        """The selected ``host:port`` string, or ``None``."""
        return self._endpoint

    def _setup_conf(self, event=None):
        """Load the JSON configuration from ``CONF_NODE`` and re-arm the watch."""
        raw, _stat = self._zk.get(CONF_NODE, watch=self._setup_conf)
        self._conf = json.loads(raw.decode('utf-8'))
        self._logger.info('New configuration found: %s' % self._conf)

    @property
    def conf(self):
        """The last configuration dict read, or ``None``."""
        return self._conf
class Node(object):
    """A member of a pool that competes for leadership of pool resources.

    Leadership of a resource is represented by an ephemeral znode at
    ``<pool.path>/leaders/<resource>`` whose data is this node's id.
    """

    def __init__(self, pool, metadata=None, max_inflight_acquires=1,
                 auto_acquire=True):
        """Connect to ZooKeeper and start the background balancing loop.

        :param pool: pool object providing ``hosts``, ``path``,
            ``resources`` and ``nodes``.
        :param metadata: stored on the instance; defaults to ``{}``.
        :param max_inflight_acquires: size of the acquire semaphore.
        :param auto_acquire: when false, :meth:`_check_for_takeover` does
            nothing.
        :raises Exception: when ZooKeeper is not reachable within 5 seconds.
        """
        self.pool = pool
        self.zk = KazooClient(pool.hosts, timeout=5,
                              handler=SequentialGeventHandler())
        event = self.zk.start_async()
        event.wait(timeout=5)
        if not self.zk.connected:
            self.zk.stop()
            raise Exception('Failed to reach zookeeper')

        self.metadata = metadata or {}
        self.id = None
        self.path = None
        self.auto_acquire = auto_acquire
        # 0 means "no limit" (see _try_takeover).
        self.max_resources = 0

        # Set of resources we own
        self.resources = set()
        # resource name -> time.time() of the last release
        self._resource_backoff = {}
        self._resources_acquiring = gevent.lock.Semaphore(
            max_inflight_acquires)

        # Callbacks
        self.on_acquire_resource = None
        self.on_release_resource = None

        self._anti_entropy_greenlet = gevent.spawn(self._anti_entropy)

    def disconnect(self):
        """Stop the kazoo client.

        ``KazooClient`` exposes ``stop()`` for this (as ``Pool.disconnect``
        uses); the ``disconnect()`` the original called is not part of the
        client API.
        """
        self.zk.stop()

    def acquire(self, resource):
        """Force a takeover of ``resource``; returns the takeover result."""
        assert resource in self.pool.resources
        return self._try_takeover(resource, force=True)

    def release(self, resource):
        """Delete our leader znode for ``resource``.

        ``self.resources`` is updated by the data watch, not here.
        """
        assert resource in self.resources

        # TODO: transaction here
        self.zk.delete(os.path.join(self.pool.path, 'leaders', resource))

    def leave(self):
        """Release every resource currently owned."""
        for resource in list(self.resources):
            self.release(resource)

    def join(self):
        """Register this node in the pool and start watching leaders."""
        path = self.zk.create(os.path.join(self.pool.path, 'nodes', ''),
                              ephemeral=True, sequence=True)
        self.path = path
        self.id = path.rsplit('/', 1)[-1]

        # Watch for leadership changes so we can possibly take over
        ChildrenWatch(self.zk, os.path.join(self.pool.path, 'leaders'),
                      self._on_leaders_change)

        # Now that we've joined, lets see if there are any dangling
        # resources we can take ownership of
        gevent.spawn(self._check_for_takeover, delay=0)

    def _id_bytes(self):
        """Return this node's id as bytes, as stored in leader znodes."""
        return self.id.encode()

    def _on_leaders_change(self, data):
        """Children-watch callback for the ``leaders`` node."""
        # TODO: debounce this instead of just sleeping
        gevent.spawn(self._check_for_takeover, delay=5)

    def _on_resource_leader_change(self, data, stat, event):
        """Data-watch callback for one ``leaders/<resource>`` znode.

        Returns ``False`` (which ends the watch) after we lost a resource we
        owned; otherwise returns ``None``.
        """
        if not event:
            return

        resource_name = event.path.split('/')[-1]
        if resource_name not in self.pool.resources:
            return

        if resource_name in self.resources:
            # Znode data is bytes; compare against the encoded id so the
            # check is meaningful on Python 3 as well.
            if event.type == 'DELETED' or data != self._id_bytes():
                self._resource_backoff[resource_name] = time.time()
                self.resources.remove(resource_name)
                if callable(self.on_release_resource):
                    self.on_release_resource(self, resource_name)
                return False

        if event.type == 'DELETED':
            self._try_takeover(resource_name)

    def _check_for_takeover(self, delay=5):
        """Try to take over every resource that currently has no leader."""
        if not self.auto_acquire:
            return

        time.sleep(delay)

        resources_with_leaders = set(
            self.zk.get_children(os.path.join(self.pool.path, 'leaders')))
        resources_without_leaders = (
            self.pool.resources - resources_with_leaders)
        # The node set may still be empty before the pool's watch has fired;
        # skip the even-split check then instead of dividing by zero.
        node_count = len(self.pool.nodes)

        for resource in resources_without_leaders:
            self._try_takeover(resource)

            # If we have more than the even-split number of resources,
            # backoff a bit
            if node_count and (len(self.resources) >
                               len(self.pool.resources) / node_count):
                time.sleep(1)

    def _try_takeover(self, resource, force=False):
        """Try to become leader of ``resource``.

        :param force: replace an existing leader znode (in a transaction
            guarded by its version) and ignore the release backoff.
        :return: ``True`` when leadership was obtained, ``False`` otherwise.
        """
        if self.max_resources and len(self.resources) >= self.max_resources:
            return False

        if not force and resource in self._resource_backoff:
            # Leave a recently released resource alone for 10 seconds.
            if time.time() - self._resource_backoff[resource] < 10:
                return False
            del self._resource_backoff[resource]

        if self._resources_acquiring.locked():
            return False

        with self._resources_acquiring:
            path = os.path.join(self.pool.path, 'leaders', resource)
            id_bytes = self._id_bytes()

            try:
                self.zk.create(path, id_bytes, ephemeral=True)
            except NodeExistsError:
                if not force:
                    return False

                _, metadata = self.zk.get(path)
                transaction = self.zk.transaction()
                transaction.delete(path, version=metadata.version)
                transaction.create(path, id_bytes, ephemeral=True)
                result = transaction.commit()
                if result[0] is not True or result[1] != path:
                    return False

            DataWatch(self.zk, path, self._on_resource_leader_change)
            self.resources.add(resource)
            if callable(self.on_acquire_resource):
                self.on_acquire_resource(self, resource)
        return True

    def balance(self):
        """Release one random resource when we hold clearly too many.

        "Too many" means more than ``ceil(resources / nodes) + 1``.  Does
        nothing while the pool reports no nodes.
        """
        node_count = len(self.pool.nodes)
        if not node_count:
            # Nothing to balance against yet (and avoids division by zero).
            return

        threshold = math.ceil(len(self.pool.resources) / (node_count * 1.0))
        if len(self.resources) > threshold + 1:
            resource = random.choice(list(self.resources))
            self._resource_backoff[resource] = time.time()
            self.release(resource)

    def _anti_entropy(self):
        """Background loop: rebalance every 10 seconds."""
        while True:
            time.sleep(10)
            self.balance()