class PingPongServer(Observable):
    """HTTP endpoint that relays "ping" requests to a fixed target host.

    Each ping handled by :meth:`ping` bumps the ``pings`` gauge and, while the
    time-to-live is still positive after decrementing, schedules a follow-up
    ping to the configured target via ``defer``.
    """

    # Delay handed to ``defer`` before a ping is forwarded to the target.
    PING_DELAY = Amount(1, Time.SECONDS)

    def __init__(self, target_host, target_port, clock=time):
        """Remember the target and register the ``pings`` gauge.

        :param target_host: host name/address that pings are forwarded to.
        :param target_port: integer port of the target (formatted with ``%d``).
        :param clock: clock object passed through to ``defer``; defaults to
          the ``time`` module.
        """
        self._clock = clock
        self._target = (target_host, target_port)
        self._pings = AtomicGauge('pings')
        self.metrics.register(self._pings)

    def send_request(self, endpoint, message, ttl):
        """Issue ``GET http://<target>/<endpoint>/<message>/<ttl>`` and drain it.

        Any failure is logged rather than raised, so a dead peer does not
        propagate an exception into the caller.
        """
        url_base = 'http://%s:%d' % self._target
        try:
            response = urllib2.urlopen('%s/%s/%s/%d' % (url_base, endpoint, message, ttl))
            try:
                response.read()
            finally:
                # Release the connection explicitly instead of relying on GC.
                response.close()
        except Exception as e:
            log.error('Failed to query %s: %s' % (url_base, e))

    @HttpServer.route('/ping/:message')
    @HttpServer.route('/ping/:message/:ttl')
    def ping(self, message, ttl=60):
        """Handle an incoming ping and forward it while ``ttl`` remains positive.

        :param message: opaque payload taken from the URL.
        :param ttl: remaining hop count; anything accepted by ``int()``.
        """
        self._pings.increment()
        # The placeholders are (ttl, message) in that order.
        log.info('Got ping (ttl=%s): %s' % (ttl, message))
        ttl = int(ttl) - 1
        if ttl > 0:
            defer(partial(self.send_request, 'ping', message, ttl),
                  delay=self.PING_DELAY,
                  clock=self._clock)
class TwitterKazooClient(KazooClient, Observable):
    """KazooClient that exports session metrics and a ``connecting`` latch.

    Metrics registered on construction:
      * ``session_expirations`` - incremented on ``KazooState.LOST``.
      * ``connection_losses``   - incremented on ``KazooState.SUSPENDED``.
      * ``session_id``          - current ``_session_id`` (0 when unset).
    """

    @classmethod
    def make(cls, *args, **kw):
        """Construct a client and start it.

        Keyword arguments consumed here (everything else is forwarded to the
        constructor):

        :param verbose: when truthy the default log level is ``logging.INFO``,
          otherwise ``sys.maxsize``.
        :param loglevel: explicit level for the ``kazoo.devnull`` logger;
          overrides the default derived from ``verbose``.
        :param async: when true (default) call ``start_async()`` and wait for
          the ``connecting`` event; otherwise call ``start()``.
        :returns: the started client instance.
        """
        # TODO(jcohen): Consider removing verbose option entirely in favor of just using loglevel.
        verbose = kw.pop('verbose', False)
        # ``async`` is a reserved word on Python >= 3.7, so only the kwarg key
        # keeps that spelling; the local uses a different name.
        start_async = kw.pop('async', True)

        default_level = logging.INFO if verbose else sys.maxsize
        loglevel = kw.pop('loglevel', default_level)

        logger = logging.getLogger('kazoo.devnull')
        logger.setLevel(loglevel)
        kw['logger'] = logger

        zk = cls(*args, **kw)
        if start_async:
            zk.start_async()
            zk.connecting.wait()
        else:
            zk.start()

        return zk

    def __init__(self, *args, **kw):
        """Initialise the underlying client, the latch and the metrics."""
        if 'connection_retry' not in kw:
            # The default backoff delay limit in kazoo is 3600 seconds, which is generally
            # too conservative for our use cases. If not supplied by the caller, provide
            # a backoff that will truncate earlier.
            kw['connection_retry'] = KazooRetry(
                max_delay=DEFAULT_RETRY_MAX_DELAY_SECS, **DEFAULT_RETRY_DICT)

        super(TwitterKazooClient, self).__init__(*args, **kw)
        # Set by _session_callback once the CONNECTING keeper state is observed.
        self.connecting = threading.Event()
        self.__session_expirations = AtomicGauge('session_expirations')
        self.__connection_losses = AtomicGauge('connection_losses')
        self.__session_id = LambdaGauge('session_id', lambda: (self._session_id or 0))
        self.metrics.register(self.__session_expirations)
        self.metrics.register(self.__connection_losses)
        self.metrics.register(self.__session_id)
        self.add_listener(self._observable_listener)

    def _observable_listener(self, state):
        """State listener that counts lost sessions and suspended connections."""
        if state == KazooState.LOST:
            self.__session_expirations.increment()
        elif state == KazooState.SUSPENDED:
            self.__connection_losses.increment()

    def _session_callback(self, state):
        """Delegate to the parent callback, then trip ``connecting`` if applicable."""
        rc = super(TwitterKazooClient, self)._session_callback(state)
        if state == KeeperState.CONNECTING:
            self.connecting.set()
        return rc

    @property
    def live(self):
        """Expose the parent client's ``_live`` attribute."""
        return self._live
class StatusStats(Observable):
    """Observable holding two gauges: ``count`` and ``total_ns``.

    ``total_ns`` is presumably a running total in nanoseconds (inferred from
    the name only).
    """

    def __init__(self):
        """Create both gauges and register them, ``count`` first."""
        count_gauge = AtomicGauge('count')
        total_gauge = AtomicGauge('total_ns')
        self._count = count_gauge
        self._ns = total_gauge
        for gauge in (count_gauge, total_gauge):
            self.metrics.register(gauge)

    def increment(self, ns):
        """Record one more event and add ``ns`` to the running total."""
        self._count.increment()
        self._ns.add(ns)
class TwitterKazooClient(KazooClient, Observable):
    """KazooClient that exports session metrics and a ``connecting`` latch.

    Metrics registered on construction:
      * ``session_expirations`` - incremented on ``KazooState.LOST``.
      * ``connection_losses``   - incremented on ``KazooState.SUSPENDED``.
      * ``session_id``          - current ``_session_id`` (0 when unset).
    """

    @classmethod
    def make(cls, *args, **kw):
        """Construct a client and start it.

        Keyword arguments consumed here (everything else is forwarded to the
        constructor):

        :param verbose: unless changed from its default ``False``, a logger
          named ``kazoo.devnull`` with level ``sys.maxsize`` is injected.
        :param async: when true (default) call ``start_async()`` and wait for
          the ``connecting`` event; otherwise call ``start()``.
        :returns: the started client instance.
        """
        verbose = kw.pop('verbose', False)
        # ``async`` is a reserved word on Python >= 3.7, so only the kwarg key
        # keeps that spelling; the local uses a different name.
        start_async = kw.pop('async', True)

        # Identity check kept on purpose: only a literal False silences logging.
        if verbose is False:
            kw['logger'] = logging.Logger('kazoo.devnull', level=sys.maxsize)

        zk = cls(*args, **kw)
        if start_async:
            zk.start_async()
            zk.connecting.wait()
        else:
            zk.start()

        return zk

    def __init__(self, *args, **kw):
        """Initialise the underlying client, the latch and the metrics."""
        super(TwitterKazooClient, self).__init__(*args, **kw)
        # Set by _session_callback once the CONNECTING keeper state is observed.
        self.connecting = threading.Event()
        self.__session_expirations = AtomicGauge('session_expirations')
        self.__connection_losses = AtomicGauge('connection_losses')
        self.__session_id = LambdaGauge('session_id', lambda: (self._session_id or 0))
        self.metrics.register(self.__session_expirations)
        self.metrics.register(self.__connection_losses)
        self.metrics.register(self.__session_id)
        self.add_listener(self._observable_listener)

    def _observable_listener(self, state):
        """State listener that counts lost sessions and suspended connections."""
        if state == KazooState.LOST:
            self.__session_expirations.increment()
        elif state == KazooState.SUSPENDED:
            self.__connection_losses.increment()

    def _session_callback(self, state):
        """Delegate to the parent callback, then trip ``connecting`` if applicable."""
        rc = super(TwitterKazooClient, self)._session_callback(state)
        if state == KeeperState.CONNECTING:
            self.connecting.set()
        return rc

    @property
    def live(self):
        """Expose the parent client's ``_live`` attribute."""
        return self._live
class PingPongServer(Observable):
    """HTTP endpoint that relays "ping" requests to a fixed target host.

    Each ping handled by :meth:`ping` bumps the ``pings`` gauge and, while the
    time-to-live is still positive after decrementing, schedules a follow-up
    ping to the configured target via ``defer``.
    """

    # Delay handed to ``defer`` before a ping is forwarded to the target.
    PING_DELAY = Amount(1, Time.SECONDS)

    def __init__(self, target_host, target_port, clock=time):
        """Remember the target and register the ``pings`` gauge.

        :param target_host: host name/address that pings are forwarded to.
        :param target_port: integer port of the target (formatted with ``%d``).
        :param clock: clock object passed through to ``defer``; defaults to
          the ``time`` module.
        """
        self._clock = clock
        self._target = (target_host, target_port)
        self._pings = AtomicGauge("pings")
        self.metrics.register(self._pings)

    def send_request(self, endpoint, message, ttl):
        """Issue ``GET http://<target>/<endpoint>/<message>/<ttl>`` and drain it.

        Any failure is logged rather than raised, so a dead peer does not
        propagate an exception into the caller.
        """
        url_base = "http://%s:%d" % self._target
        try:
            response = urllib2.urlopen("%s/%s/%s/%d" % (url_base, endpoint, message, ttl))
            try:
                response.read()
            finally:
                # Release the connection explicitly instead of relying on GC.
                response.close()
        except Exception as e:
            log.error("Failed to query %s: %s" % (url_base, e))

    @HttpServer.route("/ping/:message")
    @HttpServer.route("/ping/:message/:ttl")
    def ping(self, message, ttl=60):
        """Handle an incoming ping and forward it while ``ttl`` remains positive.

        :param message: opaque payload taken from the URL.
        :param ttl: remaining hop count; anything accepted by ``int()``.
        """
        self._pings.increment()
        # The placeholders are (ttl, message) in that order.
        log.info("Got ping (ttl=%s): %s" % (ttl, message))
        ttl = int(ttl) - 1
        if ttl > 0:
            defer(partial(self.send_request, "ping", message, ttl),
                  delay=self.PING_DELAY,
                  clock=self._clock)
class ZooKeeper(object):
    """A convenience wrapper around the low-level ZooKeeper API.

    Blocks until the initial connection is established, and proxies method calls
    to the corresponding ZK functions, passing the handle.

    Supports both synchronous and asynchronous APIs.

    Synchronous API Notes:

      Synchronous calls will block across connection loss or session
      expiration until reconnected.

    Asynchronous API Notes:

      Asynchronous calls will queue up while the session/connection is
      unhealthy and only be dispatched while zookeeper is healthy.  It is
      still possible for asynchronous calls to fail should the session be
      severed after the call has been successfully dispatched.  In other
      words: don't assume your rc will always be zookeeper.OK.

      Watches will behave as normal assuming successful dispatch.  In general
      when using this wrapper, you should retry your call if your watch is
      fired with EXPIRED_SESSION_STATE and ignore anything else whose state is
      not CONNECTED_STATE.  This wrapper will never re-dispatch calls that
      have been sent to zookeeper without error.
    """

    class Error(Exception): pass
    class ConnectionTimeout(Error): pass
    class InvalidEnsemble(Error): pass
    class Stopped(Error): pass

    # White-list of methods that accept a ZK handle as their first argument
    _ZK_SYNC_METHODS = frozenset([
        'add_auth', 'close', 'create', 'delete', 'exists', 'get', 'get_acl',
        'get_children', 'is_unrecoverable', 'recv_timeout', 'set', 'set2',
        'set_acl', 'set_watcher', 'state',
    ])

    _ZK_ASYNC_METHODS = frozenset([
        'acreate', 'adelete', 'aexists', 'aget', 'aget_acl', 'aget_children', 'aset',
        'aset_acl', 'async'
    ])

    COMPLETION_RETRY = frozenset([
        zookeeper.CONNECTIONLOSS,
        zookeeper.OPERATIONTIMEOUT,
        zookeeper.SESSIONEXPIRED,
        zookeeper.CLOSING,
    ])

    @classmethod
    def expand_ensemble(cls, servers):
        """Expand comma-separated list of host:port to comma-separated, fully-resolved list of ip:port.

        A bare ``host`` gets ``DEFAULT_PORT``.  Raises ``InvalidEnsemble`` when a
        port is not an integer or a host cannot be resolved.
        """
        server_ports = []
        for server_port in servers.split(','):
            server_split = server_port.split(':', 2)
            if len(server_split) == 1:
                server, port = server_split[0], cls.DEFAULT_PORT
            else:
                try:
                    server, port = server_split[0], int(server_split[1])
                except ValueError:
                    raise cls.InvalidEnsemble('Invalid ensemble string: %s' % server_port)
            try:
                # set() collapses duplicate addresses returned for one host.
                for ip in set(socket.gethostbyname_ex(server)[2]):
                    server_ports.append('%s:%s' % (ip, port))
            except socket.gaierror:
                raise cls.InvalidEnsemble('Could not resolve %s' % server)
        return ','.join(server_ports)

    DEFAULT_TIMEOUT_SECONDS = 30.0
    DEFAULT_ENSEMBLE = 'localhost:2181'
    DEFAULT_PORT = 2181
    DEFAULT_ACL = ZooDefs.Acls.OPEN_ACL_UNSAFE
    MAX_RECONNECTS = 1

    # (is live?, is stopped?) => human readable status
    STATUS_MATRIX = {
        (True, True): 'WTF',
        (True, False): 'OK',
        (False, True): 'STOPPED',
        (False, False): 'CONNECTING'
    }

    class Completion(object):
        """A deferred call of ``function(zh, *args, **kw)`` against the owner's current handle.

        Calling the completion either returns the function's result or, when the
        call hit a connection-level failure, re-enqueues the work on the owning
        ZooKeeper object and returns ``zookeeper.OK``.
        """

        def __init__(self, zk, function, *args, **kw):
            self._zk = zk
            # Random id used only to correlate log lines.  sys.maxsize rather
            # than the Python-2-only sys.maxint.
            self._cid = random.randint(0, sys.maxsize - 1)
            # 'logger' is consumed here so it is not forwarded to ``function``.
            self._logger = kw.pop('logger', log.debug)

            @wraps(function)
            def wrapper(zh):
                return function(zh, *args, **kw)

            self._fn = wrapper
            self._logger('Created %s args:(%s) kw:{%s}' % (
                self,
                ', '.join(map(repr, args)),
                ', '.join('%s: %r' % (key, val) for key, val in kw.items())))

        def __str__(self):
            return '%s(id:%s, zh:%s, %s)' % (
                self.__class__.__name__, self._cid, self._zk._zh, self._fn.__name__)

        def __call__(self):
            try:
                self._logger('%s start' % self)
                result = self._fn(self._zk._zh)
                self._logger('%s success' % self)
                return result
            except TypeError:
                # Raced; zh now dead, so re-enqueue.
                if self._zk._zh is not None:
                    raise
                self._logger('%s raced, re-enqueueing' % self)
                self._zk._add_completion(self._fn)
            except (zookeeper.ConnectionLossException,
                    zookeeper.InvalidStateException,
                    SystemError) as e:
                self._logger('%s excepted (%s), re-enqueueing' % (self, e))
                self._zk._add_completion(self._fn)
            return zookeeper.OK

    # N.B.(wickman) This code is theoretically racy.  We cannot synchronize
    # events across the zookeeper C event loop, however we do everything in
    # our power to catch transitional latches.  These are almost always
    # exercised in tests and never in practice.
    #
    # TODO(wickman) ConnectionLoss probably does not encapsulate all the
    # exception states that arise on connection loss and/or session
    # expiration.  However, we don't want to blanket catch ZooKeeperException
    # because some things e.g. get() will raise NoNodeException.  We should
    # partition the exception space in two: behavioral exceptions and, well,
    # exceptional exceptions.
    class BlockingCompletion(Completion):
        """Completion that retries in place until it succeeds or the owner is stopped."""

        def __call__(self):
            while True:
                try:
                    self._logger('%s start' % self)
                    result = self._fn(self._zk._zh)
                    self._logger('%s success' % self)
                    return result
                except (zookeeper.ConnectionLossException,
                        zookeeper.InvalidStateException,
                        TypeError) as e:
                    # TypeError because we raced on live latch from True=>False when _zh gets
                    # reinitialized.
                    if isinstance(e, TypeError) and self._zk._zh is not None:
                        self._logger('%s excepted, user error' % self)
                        raise
                    # We had the misfortune of the live latch being set but having a session
                    # event propagate before the BlockingCompletion could be executed.
                    while not self._zk._stopped.is_set():
                        self._logger('%s [live: %s] excepted on connection event: %s' % (
                            self, self._zk._live.is_set(), e))
                        self._zk._live.wait(timeout=0.1)
                        if self._zk._live.is_set():
                            break
                    if self._zk._stopped.is_set():
                        raise ZooKeeper.Stopped('ZooKeeper is stopped.')
                except Exception as e:
                    self._logger('%s excepted unexpectedly: %s' % (self, e))
                    raise

    def __init__(self,
                 servers=None,
                 timeout_secs=None,
                 watch=None,
                 max_reconnects=None,
                 authentication=None,
                 logger=log.debug):
        """Create new ZooKeeper object.

        Blocks until ZK negotiation completes, or the timeout expires. By default
        only tries to connect once.  Use a larger 'max_reconnects' if you want
        to be resilient to things such as DNS outages/changes.

        If watch is set to a function, it is called whenever the global
        zookeeper watch is dispatched using the same function signature, with the
        exception that this object is used in place of the zookeeper handle.

        If authentication is set, it should be a tuple of (scheme, credentials),
        for example, ('digest', 'username:password')

        A 'max_reconnects' that is zero or negative disables the attempt limit.
        Raises ConnectionTimeout when the attempt limit is reached and
        InvalidEnsemble when 'servers' cannot be parsed or resolved.
        """

        default_ensemble = self.DEFAULT_ENSEMBLE
        default_timeout = self.DEFAULT_TIMEOUT_SECONDS
        default_reconnects = self.MAX_RECONNECTS
        if WITH_APP:
            options = app.get_options()
            default_ensemble = options.twitter_common_zookeeper_ensemble
            default_timeout = options.twitter_common_zookeeper_timeout
            default_reconnects = options.twitter_common_zookeeper_reconnects
        self._servers = servers or default_ensemble
        self._timeout_secs = timeout_secs or default_timeout
        self._init_count = 0
        self._credentials = authentication
        self._authenticated = threading.Event()
        self._live = threading.Event()
        self._stopped = threading.Event()
        self._completions = Queue()
        self._zh = None
        self._watch = watch
        self._logger = logger
        self._max_reconnects = max_reconnects if max_reconnects is not None else default_reconnects
        self._init_stats()
        self.reconnect()

    def __del__(self):
        self._safe_close()

    def _log(self, msg):
        """Log ``msg`` prefixed with the current handle."""
        self._logger('[zh:%s] %s' % (self._zh, msg))

    def _init_stats(self):
        """Create the session-expiration and connection-loss gauges."""
        self._gauge_session_expirations = AtomicGauge('session-expirations')
        self._gauge_connection_losses = AtomicGauge('connection-losses')

    def session_id(self):
        """Return the current session id, or None if it cannot be obtained."""
        try:
            session_id, _ = zookeeper.client_id(self._zh)
            return session_id
        except Exception:
            # Best effort; narrowed from a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return None

    @property
    def session_expirations(self):
        return self._gauge_session_expirations.read()

    @property
    def connection_losses(self):
        return self._gauge_connection_losses.read()

    @property
    def live(self):
        return self._live.is_set()

    def stop(self):
        """Gracefully stop this Zookeeper session."""
        self._log('Shutting down ZooKeeper')
        self._stopped.set()
        self._safe_close()
        self._completions = Queue()  # there is no .clear()

    def restart(self):
        """Stop and restart this Zookeeper session.  Unfinished completions will be retried
           on reconnection."""
        self._safe_close()
        self._stopped.clear()
        self.reconnect()

    def _safe_close(self):
        """Close the current handle, if any, and clear the live latch."""
        if self._zh is not None:
            zh, self._zh = self._zh, None
            try:
                zookeeper.close(zh)
            except zookeeper.ZooKeeperException:
                # the session has been corrupted or otherwise disconnected
                pass
        self._live.clear()

    def _add_completion(self, function, *args, **kw):
        """Queue ``function`` for dispatch once the connection is live."""
        self._completions.put(self.Completion(self, function, logger=self._log, *args, **kw))

    def _clear_completions(self):
        """Run queued completions while live, stopping when the queue is empty."""
        while self._live.is_set():
            try:
                completion = self._completions.get_nowait()
                completion()
                self._completions.task_done()
            except Empty:
                return

    def reconnect(self):
        """Attempt to reconnect to ZK."""
        if self._stopped.is_set():
            self._safe_close()
            return

        def safe_close(zh):
            try:
                zookeeper.close(zh)
            except Exception:
                # TODO(wickman) When the SystemError bug is fixed in zkpython, narrow this
                # except clause further.
                pass

        def activate():
            self._authenticated.set()
            self._live.set()

        def on_authentication(zh, rc):
            # A callback for a handle we no longer own: just dispose of it.
            if self._zh != zh:
                safe_close(zh)
                return
            if rc == zookeeper.OK:
                activate()

        def maybe_authenticate():
            if self._authenticated.is_set() or not self._credentials:
                activate()
                return
            try:
                scheme, credentials = self._credentials
                zookeeper.add_auth(self._zh, scheme, credentials, on_authentication)
            except zookeeper.ZooKeeperException as e:
                self._logger('Failed to authenticate: %s' % e)

        def connection_handler(handle, type, state, path):
            if self._zh != handle:
                safe_close(handle)
                return
            if self._stopped.is_set():
                return
            if self._watch:
                self._watch(self, type, state, path)
            if state == zookeeper.CONNECTED_STATE:
                self._logger('Connection started, setting live.')
                maybe_authenticate()
                self._clear_completions()
            elif state == zookeeper.EXPIRED_SESSION_STATE:
                self._logger('Session lost, clearing live state.')
                self._gauge_session_expirations.increment()
                self._live.clear()
                self._authenticated.clear()
                self._zh = None
                self._init_count = 0
                self.reconnect()
            else:
                self._logger('Connection lost, clearing live state.')
                self._gauge_connection_losses.increment()
                self._live.clear()

        # this closure is exposed for testing only -- in order to simulate session events.
        self._handler = connection_handler

        timeout_ms = int(self._timeout_secs * 1000)
        while True:
            self._safe_close()
            servers = self.expand_ensemble(self._servers)
            self._log('Connecting to ZK hosts at %s' % servers)
            self._zh = zookeeper.init(servers, connection_handler, timeout_ms)
            self._init_count += 1
            self._live.wait(self._timeout_secs + 1)
            if self._live.is_set():
                break
            elif self._max_reconnects > 0 and self._init_count >= self._max_reconnects:
                self._safe_close()
                raise ZooKeeper.ConnectionTimeout(
                    'Timed out waiting for ZK connection to %s' % servers)
        self._log('Successfully connected to ZK at %s' % servers)

    def _wrap_sync(self, function_name):
        """Wrap a zookeeper module function in an error-handling completion that injects the
           current zookeeper handle as the first parameter."""
        function = getattr(zookeeper, function_name)

        @wraps(function)
        def _curry(*args, **kwargs):
            return self.BlockingCompletion(self, function, logger=self._log, *args, **kwargs)()
        return _curry

    def _wrap_async(self, function_name):
        """Wrap an asynchronous zookeeper module function in an error-handling completion that
           injects the current zookeeper handle as the first parameter and puts it on a
           completion queue if the current connection state is unhealthy."""
        function = getattr(zookeeper, function_name)

        @wraps(function)
        def _curry(*args, **kwargs):
            completion = self.Completion(self, function, logger=self._log, *args, **kwargs)
            if self._live.is_set():
                return completion()
            # TODO(wickman)  This is racy, should it go from not live => live
            # prior to Queue.put.  Two solutions: a periodic background thread
            # that attempts to empty the completion queue, or use a mutex-protected
            # container for self._live.
            #
            # Enqueue the completion built above instead of constructing a second one.
            self._completions.put(completion)
            return zookeeper.OK  # proxy OK.
        return _curry

    def safe_create(self, path, acl=DEFAULT_ACL):
        """Create every component of ``path`` that does not exist yet; return the full path.

        A NoAuthException is tolerated for components that already exist.
        """
        child = '/'
        for component in filter(None, path.split('/')):
            child = posixpath.join(child, component)
            try:
                self.create(child, "", acl, 0)
            except zookeeper.NodeExistsException:
                continue
            except zookeeper.NoAuthException:
                if not self.exists(child):
                    raise
        return child

    def safe_delete(self, path):
        """Recursively delete ``path``.

        Returns True when the node is gone (or never existed) and False when
        any ZooKeeperException was raised along the way.
        """
        try:
            if not self.exists(path):
                return True
            for child in self.get_children(path):
                if not self.safe_delete(posixpath.join(path, child)):
                    return False
            self.delete(path)
        except zookeeper.ZooKeeperException:
            return False
        return True

    def __getattr__(self, function_name):
        """Proxy to underlying ZK functions."""
        if function_name in ZooKeeper._ZK_SYNC_METHODS:
            return self._wrap_sync(function_name)
        elif function_name in ZooKeeper._ZK_ASYNC_METHODS:
            return self._wrap_async(function_name)
        else:
            raise AttributeError('%r has no attribute %r' % (self, function_name))

    def __str__(self):
        return 'ZooKeeper(status=%s,queued=%d,servers=%r)' % (
            self.STATUS_MATRIX[(self._live.is_set(), self._stopped.is_set())],
            self._completions.qsize(),
            self._servers)

    def __repr__(self):
        return 'ZooKeeper(servers=%r)' % self._servers
class ZooKeeper(Observable):
    """A convenience wrapper around the low-level ZooKeeper API.

    Blocks until the initial connection is established, and proxies method calls
    to the corresponding ZK functions, passing the handle.

    Supports both synchronous and asynchronous APIs.

    Synchronous API Notes:

      Synchronous calls will block across connection loss or session
      expiration until reconnected.

    Asynchronous API Notes:

      Asynchronous calls will queue up while the session/connection is
      unhealthy and only be dispatched while zookeeper is healthy.  It is
      still possible for asynchronous calls to fail should the session be
      severed after the call has been successfully dispatched.  In other
      words: don't assume your rc will always be zookeeper.OK.

      Watches will behave as normal assuming successful dispatch.  In general
      when using this wrapper, you should retry your call if your watch is
      fired with EXPIRED_SESSION_STATE and ignore anything else whose state is
      not CONNECTED_STATE.  This wrapper will never re-dispatch calls that
      have been sent to zookeeper without error.
    """

    class Error(Exception): pass
    class ConnectionTimeout(Error): pass
    class InvalidEnsemble(Error): pass
    class Stopped(Error): pass

    # White-list of methods that accept a ZK handle as their first argument
    _ZK_SYNC_METHODS = frozenset([
        'add_auth', 'close', 'create', 'delete', 'exists', 'get', 'get_acl',
        'get_children', 'is_unrecoverable', 'recv_timeout', 'set', 'set2',
        'set_acl', 'set_watcher', 'state',
    ])

    _ZK_ASYNC_METHODS = frozenset([
        'acreate', 'adelete', 'aexists', 'aget', 'aget_acl', 'aget_children', 'aset',
        'aset_acl', 'async'
    ])

    COMPLETION_RETRY = frozenset([
        zookeeper.CONNECTIONLOSS,
        zookeeper.OPERATIONTIMEOUT,
        zookeeper.SESSIONEXPIRED,
        zookeeper.CLOSING,
    ])

    @classmethod
    def expand_ensemble(cls, servers):
        """Expand comma-separated list of host:port to comma-separated, fully-resolved list of ip:port.

        A bare ``host`` gets ``DEFAULT_PORT``.  Raises ``InvalidEnsemble`` when a
        port is not an integer or a host cannot be resolved.
        """
        server_ports = []
        for server_port in servers.split(','):
            server_split = server_port.split(':', 2)
            if len(server_split) == 1:
                server, port = server_split[0], cls.DEFAULT_PORT
            else:
                try:
                    server, port = server_split[0], int(server_split[1])
                except ValueError:
                    raise cls.InvalidEnsemble('Invalid ensemble string: %s' % server_port)
            try:
                for ip in socket.gethostbyname_ex(server)[2]:
                    server_ports.append('%s:%s' % (ip, port))
            except socket.gaierror:
                raise cls.InvalidEnsemble('Could not resolve %s' % server)
        return ','.join(server_ports)

    DEFAULT_TIMEOUT_SECONDS = 30.0
    DEFAULT_ENSEMBLE = 'localhost:2181'
    DEFAULT_PORT = 2181
    DEFAULT_ACL = ZooDefs.Acls.OPEN_ACL_UNSAFE
    MAX_RECONNECTS = 1

    # (is live?, is stopped?) => human readable status
    STATUS_MATRIX = {
        (True, True): 'WTF',
        (True, False): 'OK',
        (False, True): 'STOPPED',
        (False, False): 'CONNECTING'
    }

    class Completion(object):
        """A deferred call of ``function(zh, *args, **kw)`` against the owner's current handle.

        Calling the completion either returns the function's result or, when the
        call hit a connection-level failure, re-enqueues the work on the owning
        ZooKeeper object and returns ``zookeeper.OK``.
        """

        def __init__(self, zk, function, *args, **kw):
            self._zk = zk
            # Random id used only to correlate log lines.  sys.maxsize rather
            # than the Python-2-only sys.maxint.
            self._cid = random.randint(0, sys.maxsize - 1)
            # 'logger' is consumed here so it is not forwarded to ``function``.
            self._logger = kw.pop('logger', log.debug)

            @wraps(function)
            def wrapper(zh):
                return function(zh, *args, **kw)

            self._fn = wrapper
            self._logger('Created %s args:(%s) kw:{%s}' % (
                self,
                ', '.join(map(repr, args)),
                ', '.join('%s: %r' % (key, val) for key, val in kw.items())))

        def __str__(self):
            return '%s(id:%s, zh:%s, %s)' % (
                self.__class__.__name__, self._cid, self._zk._zh, self._fn.__name__)

        def __call__(self):
            try:
                self._logger('%s start' % self)
                result = self._fn(self._zk._zh)
                self._logger('%s success' % self)
                return result
            except TypeError:
                # Raced; zh now dead, so re-enqueue.
                if self._zk._zh is not None:
                    raise
                self._logger('%s raced, re-enqueueing' % self)
                self._zk._add_completion(self._fn)
            except (zookeeper.ConnectionLossException,
                    zookeeper.InvalidStateException,
                    zookeeper.SessionExpiredException,
                    SystemError) as e:
                self._logger('%s excepted (%s), re-enqueueing' % (self, e))
                self._zk._add_completion(self._fn)
            return zookeeper.OK

    # N.B.(wickman) This code is theoretically racy.  We cannot synchronize
    # events across the zookeeper C event loop, however we do everything in
    # our power to catch transitional latches.  These are almost always
    # exercised in tests and never in practice.
    #
    # TODO(wickman) ConnectionLoss probably does not encapsulate all the
    # exception states that arise on connection loss and/or session
    # expiration.  However, we don't want to blanket catch ZooKeeperException
    # because some things e.g. get() will raise NoNodeException.  We should
    # partition the exception space in two: behavioral exceptions and, well,
    # exceptional exceptions.
    class BlockingCompletion(Completion):
        """Completion that retries in place until it succeeds or the owner is stopped."""

        def __call__(self):
            while True:
                try:
                    self._logger('%s start' % self)
                    result = self._fn(self._zk._zh)
                    self._logger('%s success' % self)
                    return result
                except (zookeeper.ConnectionLossException,
                        zookeeper.InvalidStateException,
                        zookeeper.SessionExpiredException,
                        TypeError) as e:
                    # TypeError because we raced on live latch from True=>False when _zh gets
                    # reinitialized.
                    if isinstance(e, TypeError) and self._zk._zh is not None:
                        self._logger('%s excepted, user error' % self)
                        raise
                    # We had the misfortune of the live latch being set but having a session
                    # event propagate before the BlockingCompletion could be executed.
                    while not self._zk._stopped.is_set():
                        self._logger('%s [live: %s] excepted on connection event: %s' % (
                            self, self._zk._live.is_set(), e))
                        self._zk._live.wait(timeout=0.1)
                        if self._zk._live.is_set():
                            break
                    if self._zk._stopped.is_set():
                        raise ZooKeeper.Stopped('ZooKeeper is stopped.')
                except Exception as e:
                    self._logger('%s excepted unexpectedly: %s' % (self, e))
                    raise

    def __init__(self,
                 servers=None,
                 timeout_secs=None,
                 watch=None,
                 max_reconnects=None,
                 authentication=None,
                 logger=log.debug):
        """Create new ZooKeeper object.

        Blocks until ZK negotiation completes, or the timeout expires. By default
        only tries to connect once.  Use a larger 'max_reconnects' if you want
        to be resilient to things such as DNS outages/changes.

        If watch is set to a function, it is called whenever the global
        zookeeper watch is dispatched using the same function signature, with the
        exception that this object is used in place of the zookeeper handle.

        If authentication is set, it should be a tuple of (scheme, credentials),
        for example, ('digest', 'username:password')

        A 'max_reconnects' that is zero or negative disables the attempt limit.
        Raises ConnectionTimeout when the attempt limit is reached and
        InvalidEnsemble when 'servers' cannot be parsed or resolved.
        """

        default_ensemble = self.DEFAULT_ENSEMBLE
        default_timeout = self.DEFAULT_TIMEOUT_SECONDS
        default_reconnects = self.MAX_RECONNECTS
        if WITH_APP:
            options = app.get_options()
            default_ensemble = options.twitter_common_zookeeper_ensemble
            default_timeout = options.twitter_common_zookeeper_timeout
            default_reconnects = options.twitter_common_zookeeper_reconnects
        self._servers = servers or default_ensemble
        self._timeout_secs = timeout_secs or default_timeout
        self._init_count = 0
        self._credentials = authentication
        self._authenticated = threading.Event()
        self._live = threading.Event()
        self._stopped = threading.Event()
        self._completions = Queue()
        self._zh = None
        self._watch = watch
        self._logger = logger
        self._max_reconnects = max_reconnects if max_reconnects is not None else default_reconnects
        self._init_metrics()
        self.reconnect()

    def __del__(self):
        self._safe_close()

    def _log(self, msg):
        """Log ``msg`` prefixed with the current handle."""
        self._logger('[zh:%s] %s' % (self._zh, msg))

    def _init_metrics(self):
        """Create and register the gauges exported through ``self.metrics``."""
        self._session_expirations = AtomicGauge('session_expirations')
        self._connection_losses = AtomicGauge('connection_losses')
        self.metrics.register(self._session_expirations)
        self.metrics.register(self._connection_losses)
        self.metrics.register(LambdaGauge('session_id', lambda: self.session_id))
        self.metrics.register(LambdaGauge('live', lambda: int(self._live.is_set())))

    @property
    def session_id(self):
        """The current session id, or None if it cannot be obtained."""
        try:
            session_id, _ = zookeeper.client_id(self._zh)
            return session_id
        except Exception:
            # Best effort; narrowed from a bare except so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return None

    @property
    def session_expirations(self):
        return self._session_expirations.read()

    @property
    def connection_losses(self):
        return self._connection_losses.read()

    @property
    def live(self):
        """The live latch itself (a threading.Event), not a boolean."""
        return self._live

    def stop(self):
        """Gracefully stop this Zookeeper session."""
        self._log('Shutting down ZooKeeper')
        self._stopped.set()
        self._safe_close()
        self._completions = Queue()  # there is no .clear()

    def restart(self):
        """Stop and restart this Zookeeper session.  Unfinished completions will be retried
           on reconnection."""
        self._safe_close()
        self._stopped.clear()
        self.reconnect()

    def _safe_close(self):
        """Close the current handle, if any, and clear the live latch."""
        if self._zh is not None:
            zh, self._zh = self._zh, None
            try:
                zookeeper.close(zh)
            except zookeeper.ZooKeeperException:
                # the session has been corrupted or otherwise disconnected
                pass
        self._live.clear()

    def _add_completion(self, function, *args, **kw):
        """Queue ``function`` for dispatch once the connection is live."""
        self._completions.put(self.Completion(self, function, logger=self._log, *args, **kw))

    def _clear_completions(self):
        """Run queued completions while live, stopping when the queue is empty."""
        while self._live.is_set():
            try:
                completion = self._completions.get_nowait()
                completion()
                self._completions.task_done()
            except Empty:
                return

    def reconnect(self):
        """Attempt to reconnect to ZK."""
        if self._stopped.is_set():
            self._safe_close()
            return

        def safe_close(zh):
            try:
                zookeeper.close(zh)
            except Exception:
                # TODO(wickman) When the SystemError bug is fixed in zkpython, narrow this
                # except clause further.
                pass

        def activate():
            self._authenticated.set()
            self._live.set()

        def on_authentication(zh, rc):
            # A callback for a handle we no longer own: just dispose of it.
            if self._zh != zh:
                safe_close(zh)
                return
            if rc == zookeeper.OK:
                activate()

        def maybe_authenticate():
            if self._authenticated.is_set() or not self._credentials:
                activate()
                return
            try:
                scheme, credentials = self._credentials
                zookeeper.add_auth(self._zh, scheme, credentials, on_authentication)
            except zookeeper.ZooKeeperException as e:
                self._logger('Failed to authenticate: %s' % e)

        def connection_handler(handle, type, state, path):
            if self._zh != handle:
                safe_close(handle)
                return
            if self._stopped.is_set():
                return
            if self._watch:
                self._watch(self, type, state, path)
            if state == zookeeper.CONNECTED_STATE:
                self._logger('Connection started, setting live.')
                maybe_authenticate()
                self._clear_completions()
            elif state == zookeeper.EXPIRED_SESSION_STATE:
                self._logger('Session lost, clearing live state.')
                self._session_expirations.increment()
                self._live.clear()
                self._authenticated.clear()
                self._zh = None
                self._init_count = 0
                self.reconnect()
            else:
                self._logger('Connection lost, clearing live state.')
                self._connection_losses.increment()
                self._live.clear()

        # this closure is exposed for testing only -- in order to simulate session events.
        self._handler = connection_handler

        timeout_ms = int(self._timeout_secs * 1000)
        while True:
            self._safe_close()
            servers = self.expand_ensemble(self._servers)
            self._log('Connecting to ZK hosts at %s' % servers)
            self._zh = zookeeper.init(servers, connection_handler, timeout_ms)
            self._init_count += 1
            self._live.wait(self._timeout_secs + 1)
            if self._live.is_set():
                break
            elif self._max_reconnects > 0 and self._init_count >= self._max_reconnects:
                self._safe_close()
                raise ZooKeeper.ConnectionTimeout(
                    'Timed out waiting for ZK connection to %s' % servers)
        self._log('Successfully connected to ZK at %s' % servers)

    def _wrap_sync(self, function_name):
        """Wrap a zookeeper module function in an error-handling completion that injects the
           current zookeeper handle as the first parameter."""
        function = getattr(zookeeper, function_name)

        @wraps(function)
        def _curry(*args, **kwargs):
            return self.BlockingCompletion(self, function, logger=self._log, *args, **kwargs)()
        return _curry

    def _wrap_async(self, function_name):
        """Wrap an asynchronous zookeeper module function in an error-handling completion that
           injects the current zookeeper handle as the first parameter and puts it on a
           completion queue if the current connection state is unhealthy."""
        function = getattr(zookeeper, function_name)

        @wraps(function)
        def _curry(*args, **kwargs):
            completion = self.Completion(self, function, logger=self._log, *args, **kwargs)
            if self._live.is_set():
                return completion()
            # TODO(wickman)  This is racy, should it go from not live => live
            # prior to Queue.put.  Two solutions: a periodic background thread
            # that attempts to empty the completion queue, or use a mutex-protected
            # container for self._live.
            #
            # Enqueue the completion built above instead of constructing a second one.
            self._completions.put(completion)
            return zookeeper.OK  # proxy OK.
        return _curry

    def safe_create(self, path, acl=DEFAULT_ACL):
        """Create every component of ``path`` that does not exist yet; return the full path.

        A NoAuthException is tolerated for components that already exist.
        """
        child = '/'
        for component in filter(None, path.split('/')):
            child = posixpath.join(child, component)
            try:
                self.create(child, "", acl, 0)
            except zookeeper.NodeExistsException:
                continue
            except zookeeper.NoAuthException:
                if not self.exists(child):
                    raise
        return child

    def safe_delete(self, path):
        """Recursively delete ``path``.

        Returns True when the node is gone (or never existed) and False when
        any ZooKeeperException was raised along the way.
        """
        try:
            if not self.exists(path):
                return True
            for child in self.get_children(path):
                if not self.safe_delete(posixpath.join(path, child)):
                    return False
            self.delete(path)
        except zookeeper.ZooKeeperException:
            return False
        return True

    def __getattr__(self, function_name):
        """Proxy to underlying ZK functions."""
        if function_name in ZooKeeper._ZK_SYNC_METHODS:
            return self._wrap_sync(function_name)
        elif function_name in ZooKeeper._ZK_ASYNC_METHODS:
            return self._wrap_async(function_name)
        else:
            raise AttributeError('%r has no attribute %r' % (self, function_name))

    def __str__(self):
        return 'ZooKeeper(status=%s,queued=%d,servers=%r)' % (
            self.STATUS_MATRIX[(self._live.is_set(), self._stopped.is_set())],
            self._completions.qsize(),
            self._servers)

    def __repr__(self):
        return 'ZooKeeper(servers=%r)' % self._servers