class Exhibitor:

    def __init__(self, exhibitor, chroot):
        self.chroot = chroot
        self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=30)
        self.client = KazooClient(hosts=self.exhibitor.zookeeper_hosts + self.chroot,
                                  command_retry={'deadline': 10, 'max_delay': 1, 'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.client.start()

    def session_listener(self, state):
        pass

    def _poll_exhibitor(self):
        if self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts + self.chroot)

    def get(self, *params):
        self._poll_exhibitor()
        return self.client.retry(self.client.get, *params)

    def get_children(self, *params):
        self._poll_exhibitor()
        try:
            return self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []
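# The session_listener above is a no-op. A minimal sketch (not part of the original
# snippet) of a listener that just logs connection-state transitions, using the
# standard kazoo.protocol.states.KazooState values:
import logging

from kazoo.protocol.states import KazooState

logger = logging.getLogger(__name__)


def logging_session_listener(state):
    # kazoo calls this from its own thread on every connection state change
    if state == KazooState.LOST:
        logger.warning('ZooKeeper session lost; ephemeral nodes are gone')
    elif state == KazooState.SUSPENDED:
        logger.warning('ZooKeeper connection suspended; commands will be retried')
    else:
        logger.info('ZooKeeper connection (re)established')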
def _zk_set_flag(zk: KazooClient, ephemeral: bool = False) -> str:
    """Store the `FLAG` value in ZooKeeper in a random znode."""
    znode = '/{}'.format(uuid.uuid4())
    zk.retry(zk.create, znode, makepath=True, ephemeral=ephemeral)
    zk.retry(zk.set, znode, FLAG)
    return znode
class ZooKeeper:
    """Helper ZooKeeper class that handles connection and node updates"""

    @Retry(exception_list=[ConnectionLoss, SessionExpiredError, KazooTimeoutError])
    def __init__(self) -> None:
        hosts = settings.ZOO_HOSTS
        retry = KazooRetry(max_tries=-1, max_delay=60)
        self._zk = KazooClient(hosts, connection_retry=retry, command_retry=retry)
        # establish the connection
        self._zk.start()

    def _set_node(self, path: str, value: Optional[Any] = None, ephemeral: bool = False) -> None:
        try:
            self._zk.retry(self._zk.set, path=path, value=pickle.dumps(value) or None)
        except NoNodeError:
            self._create_node(path, value, ephemeral)

    def _get_node(self, path: str) -> Any:
        # NoNodeError needs to be handled differently, so we don't handle it here
        value, *_ = self._zk.retry(self._zk.get, path=path, watch=False)
        return pickle.loads(value)

    def _get_children(self, path: str) -> Any:
        # NoNodeError needs to be handled differently, so we don't handle it here
        value = self._zk.retry(self._zk.get_children, path=path)
        return value

    def _delete_node(self, path: str, recursive: bool = True) -> bool:
        try:
            self._zk.retry(self._zk.delete, path=path, recursive=recursive)
            return True
        except NotEmptyError:
            return False

    def _create_node(self, path: str, value: Optional[Any] = None, ephemeral: bool = False) -> bool:
        try:
            self._zk.retry(self._zk.create, path=path, ephemeral=ephemeral,
                           value=pickle.dumps(value) or None, makepath=True)
            return True
        except NodeExistsError:
            return False
class Root(Record):

    def __init__(self, root_path):
        if not root_path.startswith("/"):
            root_path = "/" + root_path
        super(Root, self).__init__(None, root_path)
        self.lock = Lock()
        self.ZK_retry = KazooRetry(max_tries=-1)
        self.ZK = None

    def connect(self, zookeeper_hosts):
        self.ZK = KazooClient(zookeeper_hosts, connection_retry=self.ZK_retry, command_retry=self.ZK_retry)
        self.ZK.start()
        # create & load collections
        self.clusters = Collection(self, "clusters", Cluster)
        self.meta = Meta(self, "meta")
        return self.load()

    def load(self):
        super(Root, self).load()
        self.clusters.load()
        self.meta.load()
        return self

    def zk_ensure_path(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.ensure_path, *args, **kwargs)

    def zk_set(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.set, *args, **kwargs)

    def zk_get(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.get, *args, **kwargs)

    def zk_get_children(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.get_children, *args, **kwargs)

    def zk_delete(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.delete, *args, **kwargs)
def _zk_flag_exists(zk: KazooClient, znode: str) -> bool:
    """The `FLAG` value exists in ZooKeeper at the `znode` path."""
    try:
        value = zk.retry(zk.get, znode)
    except NoNodeError:
        return False
    return bool(value[0] == FLAG)
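# For symmetry with the two flag helpers above, a hypothetical cleanup helper (not
# part of the original snippets) could remove the flag znode with the same retry wrapper:
from kazoo.client import KazooClient
from kazoo.exceptions import NoNodeError


def _zk_clear_flag(zk: KazooClient, znode: str) -> bool:
    """Delete the znode created by `_zk_set_flag`; returns True if it existed."""
    try:
        zk.retry(zk.delete, znode)
        return True
    except NoNodeError:
        return False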
class TestSessions(unittest.TestCase):

    def setUp(self):
        from kazoo.client import KazooClient
        from kazoo.protocol.states import KazooState
        from kazoo.testing.common import ZookeeperCluster

        ZK_HOME = os.environ.get("ZOOKEEPER_PATH")
        ZK_CLASSPATH = os.environ.get("ZOOKEEPER_CLASSPATH")
        self.cluster = ZookeeperCluster(ZK_HOME, size=1, port_offset=21000, classpath=ZK_CLASSPATH)
        self.cluster.start()
        atexit.register(lambda cluster: self.cluster.terminate(), self.cluster)
        self.client = KazooClient(self.cluster[0].address, max_retries=5)
        self.ev = threading.Event()

        def back(state):
            if state == KazooState.CONNECTED:
                self.ev.set()

        self.client.start()
        # znode paths must be absolute, so prefix the random name with '/'
        self.path = self.client.create("/" + uuid.uuid4().hex)
        self.client.add_listener(back)

    def test_restarted_server(self):
        raise SkipTest('Patch missing')
        self.cluster.stop()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertTrue(self.client.retry(self.client.exists, self.path))

    def test_terminated_server(self):
        raise SkipTest('Patch missing')
        self.cluster.reset()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertFalse(self.client.retry(self.client.exists, self.path))

    def tearDown(self):
        self.ev.clear()
        self.client.stop()
        self.cluster.stop()
class ZkCacheTaskManager(object):

    RETRIES = 2

    def __init__(self, host='127.0.0.1:2181', lock_path_prefix='/mastermind/cache/'):
        self.client = KazooClient(host, timeout=3)
        logger.info('Connecting to zookeeper host {}, lock_path_prefix: {}'.format(
            host, lock_path_prefix))
        try:
            self.client.start()
        except Exception as e:
            logger.error(e)
            raise
        self.lock_path_prefix = lock_path_prefix

    def put_task(self, task):
        group_id = task['group']
        q = LockingQueue(self.client, self.lock_path_prefix, group_id)
        return q.put(self._serialize(task))

    def put_all(self, tasks):
        for task in tasks:
            self.put_task(task)

    def list(self):
        for group_id in self.client.retry(self.client.get_children, self.lock_path_prefix):
            for item in LockingQueue(self.client, self.lock_path_prefix, group_id).list():
                yield self._unserialize(item)

    @staticmethod
    def _serialize(task):
        return msgpack.packb(task)

    @staticmethod
    def _unserialize(task):
        return msgpack.unpackb(task)
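# The matching consumer is not shown above. A rough sketch of a worker built on
# kazoo's built-in kazoo.recipe.queue.LockingQueue (note: the manager above uses a
# project-specific LockingQueue wrapper with a different constructor and a list()
# method, so this illustrates the pattern rather than the original consumer):
import msgpack
from kazoo.client import KazooClient
from kazoo.recipe.queue import LockingQueue

client = KazooClient('127.0.0.1:2181', timeout=3)
client.start()

# one queue per group, mirroring the path layout used by put_task()
queue = LockingQueue(client, '/mastermind/cache/1')

while True:
    raw = queue.get(timeout=30)  # blocks until an item is available and locked
    if raw is None:
        continue
    task = msgpack.unpackb(raw)
    # ... process the task ...
    queue.consume()              # remove the item once it has been handled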
def main(args):
    zk = KazooClient(hosts=zk_host)
    zk.start()
    register_emergency_signal_handler(zk)
    try:
        # pass the callable and its arguments to retry(); calling create() here
        # directly would hand retry() the resulting path string instead
        zk.retry(zk.create, test_znode, makepath=True)
    except NodeExistsError:
        logging.info('{} already exists, no need to create'.format(test_znode))
    start_command = 'start({},{})'.format(args.message_size, args.topics)
    zk.retry(lambda: zk.set(test_znode, start_command))
    t_start = time.time()  # in seconds
    t_end = t_start + args.duration
    while time.time() < t_end:
        time.sleep(t_end - time.time())  # shouldn't introduce error larger than 10-15 ms
    zk.retry(lambda: zk.set(test_znode, 'stop'))
    zk.stop()
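# register_emergency_signal_handler() is referenced but not defined above. A
# hypothetical sketch, assuming it only needs to publish 'stop' and close the
# connection when the benchmark process is interrupted:
import signal
import sys


def register_emergency_signal_handler(zk):
    def handler(signum, frame):
        try:
            zk.retry(lambda: zk.set(test_znode, 'stop'))
        finally:
            zk.stop()
            sys.exit(1)

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)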
class ServiceRegister(object):

    def __init__(self, hosts="127.0.0.1:2181", read_only=True, logger=None):
        """Service registry.

        :param hosts: ZooKeeper ensemble address list
        :param read_only: whether the connection is read-only
        :param logger: logger object
        """
        if not logger:
            import logging
            logging.basicConfig()
        self._zk = KazooClient(hosts, read_only=read_only, logger=logger)
        self._zk.start()

    def restart(self):
        self._zk.restart()

    def retry_get(self, path, watcher=None):
        """Read a node with retries.

        :param path: node path
        :param watcher: watcher callback
        :return: on success, the node value and version; on failure, the error message and error code.
        """
        return self._zk.retry(self.get, path, watcher)

    def lock(self, path, identifier, timeout=None):
        """Distributed lock.

        :param path: node path
        :param identifier: lock identifier
        :param timeout: timeout in seconds
        :return: the lock object
        """
        return DLock(self._zk, path, identifier, timeout)

    def exist(self, path):
        """Whether a node exists.

        :param path: node path
        :return: True if the node exists, otherwise False.
        """
        state = self._zk.exists(path)
        return state is not None

    def create(self, path, value=""):
        """Create a node.

        :param path: node path
        :param value: node value
        :return: the node path
        """
        try:
            res_path = self._zk.create(path, value, makepath=True)
        except NodeExistsError:
            return path
        except NoNodeError as e:
            return e.message
        except ZookeeperError as e:
            return e.message
        else:
            return res_path

    def get(self, path, watcher=None):
        """Read a node's value.

        :param path: node path
        :param watcher: watcher callback
        :return: on success, the node value and version; on failure, the error message and error code.
        """
        try:
            data, state = self._zk.get(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError as e:
            return e.message, -2
        except ZookeeperError as e:
            return e.message, -3
        else:
            return data, state.version

    def get_children(self, path, watcher=None):
        """List the children of a node.

        :param path: node path
        :param watcher: watcher callback
        :return: the list of child names
        """
        try:
            data = self._zk.get_children(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError:
            return [], -2
        except ZookeeperError:
            return [], -3
        else:
            return data, 0

    def set(self, path, value, version=-1):
        """Update a node's value.

        :param path: node path
        :param value: node value
        :param version: expected version (-1 to ignore)
        :return: on success, the new version number; on failure, the error message.
        """
        try:
            state = self._zk.set(path, value, version)
        except BadVersionError as e:
            return e.message
        except NoNodeError as e:
            return e.message
        except ZookeeperError as e:
            return e.message
        else:
            return state.version
# -*- coding: utf-8 -*-
import time
import glob
import sys
import os

from kazoo.client import KazooClient

import logging
logging.basicConfig()

if __name__ == "__main__":
    print("begin.")
    zk = KazooClient(hosts='172.10.3.111:2181')
    zk.start()
    mypath = "/my/favorite"
    result = zk.retry(zk.get, mypath)
    print("result", result)
    zk.stop()
    print("exit.")
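# A small variation on the read above: instead of the client's default retry(),
# a bounded policy can be built explicitly with kazoo.retry.KazooRetry (the host
# and path are the same placeholders used above):
from kazoo.client import KazooClient
from kazoo.retry import KazooRetry

zk = KazooClient(hosts='172.10.3.111:2181')
zk.start()

# give up after 5 attempts, exponential backoff capped at 10 seconds
bounded_retry = KazooRetry(max_tries=5, delay=0.5, max_delay=10)
data, stat = bounded_retry(zk.get, "/my/favorite")
print("value=%r version=%d" % (data, stat.version))

zk.stop()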
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout', None) or 30),
                                  command_retry={'deadline': (config.get('reconnect_timeout', None) or 10),
                                                 'max_delay': 1, 'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)

        self.cluster_event = self.client.handler.event_object()
        self.fetch_cluster = True
        self.members = []
        self.leader = None
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, name, watch=None):
        try:
            return self.client.get(self.client_path(name), watch)
        except NoNodeError:
            pass
        except:
            logger.exception('get_node')
        return None

    @staticmethod
    def member(name, value, znode):
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.mzxid, name, conn_url, api_url, None, None)

    def load_members(self):
        members = []
        for member in self.client.get_children(self.client_path('/members'), self.cluster_watcher):
            data = self.get_node('/members/' + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.cluster_event.clear()
        leader = self.get_node('/leader', self.cluster_watcher)
        self.members = self.load_members()
        if leader:
            if leader[0] == self._name:
                client_id = self.client.client_id
                if client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                    logger.info('I am leader but not owner of the session. Removing leader node')
                    self.client.delete(self.client_path('/leader'))
                    leader = None
            if leader:
                for member in self.members:
                    if member.name == leader[0]:
                        leader = member
                        self.fetch_cluster = False
                        break
            if not isinstance(leader, Member):
                leader = Member(-1, leader, None, None, None, None)
        self.leader = leader
        if self.fetch_cluster:
            last_leader_operation = self.get_node('/optime/leader')
            if last_leader_operation:
                self.last_leader_operation = int(last_leader_operation[0])

    def get_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')

        return Cluster(True, self.leader, self.last_leader_operation, self.members)

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, self.client_path(path), value, **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create('/leader', self._name, makepath=True, ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def race(self, path):
        return self._create(path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        for m in self.members:
            if m.name == self._name:
                return True
        path = self.client_path('/members/' + self._name)
        try:
            self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
                return True
            except:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        last_operation = state_handler.last_operation()
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.client_path('/optime/leader')
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        if isinstance(self.leader, Member) and self.leader.name == self._name:
            self.client.delete(self.client_path('/leader'))

    def sleep(self, timeout):
        self.cluster_event.wait(timeout)
        if self.cluster_event.isSet():
            self.fetch_cluster = True
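# A minimal sketch of how the DCS class above might be instantiated, using only the
# configuration keys its __init__ actually reads (host names are placeholders):
config = {
    'hosts': ['zk1:2181', 'zk2:2181', 'zk3:2181'],
    'session_timeout': 30,     # seconds; defaults to 30 when absent
    'reconnect_timeout': 10,   # used as the command-retry deadline
    # optionally discover the ensemble via Exhibitor instead of static hosts:
    # 'exhibitor': {'hosts': 'exhibitor.example.com', 'port': 8181, 'poll_interval': 300},
}

dcs = ZooKeeper('postgresql0', config)
cluster = dcs.get_cluster()
print(cluster.leader, len(cluster.members))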
class SolrCloudManager: def __init__(self, zk_host): self.__zk = KazooClient(hosts=zk_host) self.__zk.start() def __del__(self): self.__zk.stop() def get_cluster_state(self): cs_tuple = self.__zk.retry(self.__zk.get, 'clusterstate.json') cs = json.loads(cs_tuple[0]) return cs # Check all replicas that contain node_name # Return true if ALL nodes are in the active state def replicas_are_active(self, node_name): cluster_state = self.get_cluster_state() active = True for cn, cdata in cluster_state.iteritems(): for sn, sdata in cdata['shards'].iteritems(): replica_down = False node_in_replica = False for rn, rdata in sdata['replicas'].iteritems(): if rdata['node_name'] == node_name: node_in_replica = True if rdata['state'] != "active": replica_down = True if replica_down and node_in_replica: active = False if not active: break return active # Wait for all replicas to enter the active state def wait_for_replicas(self, node_name, timeout): start_time = time.time() ra = self.replicas_are_active(node_name) while ((start_time + timeout) > time.time()) and (not ra): print "Waiting for replication to finish" time.sleep(3) ra = self.replicas_are_active(node_name) return ra def node_is_live(self, node_name): live_nodes = self.__zk.retry(self.__zk.get_children, 'live_nodes') return (node_name in live_nodes) def wait_for_live_node(self, node_name, timeout): start_time = time.time() lv = self.node_is_live(node_name) while ((start_time + timeout) > time.time()) and (not lv): print "Waiting for live node" time.sleep(3) lv = self.node_is_live(node_name) return lv def _remove_live_node(self, node_name): print(green('Deleting: live_nodes/%s' % (node_name))) self.__zk.retry(self.__zk.delete, 'live_nodes/' + node_name) return True def _restart_host_solr_service(self, host): print(green('Restarting: %s' % (host))) result = sudo("restart solr-undertow") if result.failed: print(red('Failed to restart: %s' % (host))) return False return True def restart_host_solr(self, host, host_port='8983', force=False, ln_timeout=240, rn_timeout=600): if host is None: return self._return_message(1, 'host is required') node_name = host + ':' + host_port + '_solr' if (not force) and (not self.node_is_live(node_name)): return self._return_message(10, 'Node is not live') # Don't restart if any other replicas are down if (not force) and (not self.replicas_are_active(node_name)): return self._return_message(20, 'Not all replicas are active') # LATER Make sure a reindex isn't in progress if not self._remove_live_node(node_name): return self._return_message(30, 'Error removing live node') if not self._restart_host_solr_service(host): return self._return_message(40, 'Error restarting solr service') if not self.wait_for_live_node(node_name, ln_timeout): return self._return_message(50, 'Timeout waiting for live node') if not self.wait_for_replicas(node_name, rn_timeout): return self._return_message(60, 'Timeout waiting for replicas') def _return_message(self, error_code, message): print(red({'status': error_code, 'message': message})) sys.exit(error_code)
class ZookeeperServiceRegistry(BaseServiceRegistry): def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT): super(ZookeeperServiceRegistry, self).__init__() self.chroot = chroot self.client = KazooClient( hosts=hosts, handler=SequentialGeventHandler(), ) self.client.add_listener(self.on_kazoo_state_change) self.start_count = 0 @classmethod def from_config(cls, config, **kwargs): return cls(hosts=config.get('hosts', DEFAULT_HOSTS), chroot=config.get('chroot', DEFAULT_CHROOT), **kwargs) def on_start(self, timeout=10): self.start_count += 1 if self.start_count > 1: return started = self.client.start_async() started.wait(timeout=timeout) if not self.client.connected: raise RuntimeError('could not connect to zookeeper') logger.debug('connected to zookeeper (version=%s)', '.'.join(map(str, self.client.server_version()))) def on_stop(self): self.start_count -= 1 if self.start_count != 0: return self.client.stop() def on_kazoo_state_change(self, state): logger.info('kazoo connection state changed to %s', state) def on_service_type_watch(self, service, event): try: if event.type == EventType.CHILD: # FIXME: figure out proper retry strategy self.client.retry(self.lookup, service.container, service) except Exception: logger.exception('error in service type watcher') def on_service_watch(self, service, event): try: prefix, service_type, identity = event.path.rsplit('/', 2) if event.type == EventType.DELETED: service.remove(identity) except Exception: logger.exception('error in service watcher') def _get_service_znode(self, service, service_type, identity): path = self._get_zk_path(service_type, identity) result = self.client.get_async(path, watch=functools.partial( self.on_service_watch, service)) value, znode = result.get() items = six.iteritems(json.loads(value.decode('utf-8'))) return {str(k): str(v) for k, v in items} def discover(self, container): result = self.client.get_children_async(path='%s/services' % self.chroot, ) return list(result.get()) def lookup(self, container, service, watch=True, timeout=1): def child_watch(event): print(event) service_type = service.service_type result = self.client.get_children_async( path='%s/services/%s' % (self.chroot, service_type), watch=functools.partial(self.on_service_type_watch, service), ) try: names = result.get(timeout=timeout) except NoNodeError: raise LookupFailure(None, "failed to resolve %s" % service.service_type) logger.info("lookup %s %r", service_type, names) identities = set(service.identities()) for name in names: kwargs = self._get_service_znode(service, service_type, name) identity = kwargs.pop('identity') service.update(identity, **kwargs) try: identities.remove(identity) except KeyError: pass for identity in identities: service.remove(identity) return service def _get_zk_path(self, service_type, identity): return '%s/services/%s/%s' % (self.chroot, service_type, identity) def register(self, container, service_type, timeout=1): path = self._get_zk_path(service_type, container.identity) value = json.dumps({ 'endpoint': container.endpoint, 'identity': container.identity, 'log_endpoint': container.log_endpoint, }) result = self.client.create_async(path, value.encode('utf-8'), ephemeral=True, makepath=True) # FIXME: result.set_exception(RegistrationFailure()) result.get(timeout=timeout) def unregister(self, container, service_type, timeout=1): path = self._get_zk_path(service_type, container.identity) result = self.client.delete_async(path) result.set_exception(RegistrationFailure()) result.get(timeout=timeout)
class ZooKeeper(AbstractDCS):

    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        mapping = {'use_ssl': 'use_ssl', 'verify': 'verify_certs', 'cacert': 'ca',
                   'cert': 'certfile', 'key': 'keyfile', 'key_password': 'keyfile_password'}
        kwargs = {v: config[k] for k, v in mapping.items() if k in config}

        self._client = KazooClient(hosts,
                                   handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'],
                                   connection_retry=KazooRetry(max_delay=1, max_tries=-1, sleep_func=time.sleep),
                                   command_retry=KazooRetry(deadline=config['retry_timeout'], max_delay=1,
                                                            max_tries=-1, sleep_func=time.sleep),
                                   **kwargs)
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True
        self._fetch_optime = True

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, *args):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(*args)
        return max(self.loop_wait - 2, 2) * 1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def optime_watcher(self, event):
        self._fetch_optime = True
        self.event.set()

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.optime_watcher(event)

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one.

        This method returns `!True` if session_timeout has been changed
        (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        retry = self._client.retry if isinstance(self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    def get_leader_optime(self, leader):
        watch = self.optime_watcher if not leader or leader.name != self._name else None
        optime = self.get_node(self.leader_optime_path, watch)
        self._fetch_optime = False
        return optime and int(optime[0]) or 0

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            watch = member in sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(history[1].mzxid, history[0])

        # get synchronization state
        sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.members or []
        members = self.load_members(sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(leader)

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)

    def _load_cluster(self):
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        # Optime ZNode was updated or doesn't exist and we are not leader
        elif (self._fetch_optime and not self._fetch_cluster or not cluster.last_leader_operation) and\
                not (cluster.leader and cluster.leader.name == self._name):
            try:
                optime = self.get_leader_optime(cluster.leader)
                cluster = Cluster(cluster.initialize, cluster.config, cluster.leader, optime,
                                  cluster.members, cluster.failover, cluster.sync, cluster.history)
            except Exception:
                pass
        return cluster

    def _bypass_caches(self):
        self._fetch_cluster = True

    def _create(self, path, value, retry=False, ephemeral=False):
        try:
            if retry:
                self._client.retry(self._client.create, path, value, makepath=True, ephemeral=ephemeral)
            else:
                self._client.create_async(path, value, makepath=True, ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path, self._name.encode('utf-8'), retry=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self, key, value, index=None, retry=False, do_not_create_empty=False):
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set, key, value, version=index or -1)
            else:
                self._client.set_async(key, value, version=index or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        if member and (self._client.client_id is not None and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}), data.get('tags', {})) and
                            member.data.get('version') == data.get('version') and
                            member.data.get('checkpoint_after_promote') == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(self.member_path, encoded_data, makepath=True,
                                          ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        return True

    def _delete_leader(self):
        self._client.restart()
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        return self._set_or_create(self.sync_path, value, index, retry=True, do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        if super(ZooKeeper, self).watch(leader_index, timeout) and not self._fetch_optime:
            self._fetch_cluster = True
        return self._fetch_cluster
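# The _load_cluster() path above relies on KazooClient.retry() accepting any
# callable, so a multi-step read like _inner_load_cluster() is retried as a unit.
# A standalone sketch of the same pattern (paths are illustrative only):
from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()


def read_cluster_snapshot():
    # both reads are repeated together if the connection drops mid-way
    members = zk.get_children('/service/demo/members')
    leader, _ = zk.get('/service/demo/leader')
    return leader, members


leader, members = zk.retry(read_cluster_snapshot)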
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self._client = KazooClient(hosts=hosts,
                                   timeout=(config.get('session_timeout') or 30),
                                   command_retry={'deadline': (config.get('reconnect_timeout') or 10),
                                                  'max_delay': 1, 'max_tries': -1},
                                   connection_retry={'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        self._client.start()

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        if failover:
            failover = Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])

        self._cluster = Cluster(initialize, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self._client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=True)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (not index and self._create(self.failover_path, value))
        except:
            logging.exception('set_failover_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None):
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        path = self.member_path
        data = data.encode('utf-8')

        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.retry(self._client.delete, path)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member and data == self._my_member_data:
            return True

        try:
            if member:
                self._client.retry(self._client.set, path, data)
            else:
                self._client.retry(self._client.create, path, data, makepath=True, ephemeral=True)
            self._my_member_data = data
            return True
        except NodeExistsError:
            try:
                self._client.retry(self._client.set, path, data)
                self._my_member_data = data
                return True
            except:
                logger.exception('touch_member')
        except:
            logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self._last_leader_operation:
            self._last_leader_operation = last_operation
            path = self.leader_optime_path
            try:
                self._client.retry(self._client.set, path, last_operation)
            except NoNodeError:
                try:
                    self._client.retry(self._client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
# update
try:
    result = None
    result = zk.set("/xy/test", b"some data")
except Exception as e:
    print('exception when zk.set, %s' % e)
else:
    print("zk.set /xy/test result %s" % str(result))

# delete
result = zk.delete("/xy/test/node", recursive=True)
print("zk.delete /xy/test/node result %s" % (result))

# retry an action
try:
    result = zk.retry(zk.get, "/xy/test/nodex")
except Exception as e:
    print('exception when zk.retry, %s' % e)
else:
    print("zk.retry /xy/test/nodex result %s" % str(result))

from kazoo.retry import KazooRetry

kr = KazooRetry(max_tries=3, ignore_expire=False)
try:
    result = kr(zk.get, "/xy/test/nodex")
except Exception as e:
    print('exception when KazooRetry, %s' % e)
else:
    print("KazooRetry zk.get /xy/test/nodex result %s" % (result))

# watcher
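# The snippet above stops at a "# watcher" marker. A minimal sketch of the watch
# pattern it presumably leads into, using the client's DataWatch recipe on the
# same test path:
@zk.DataWatch("/xy/test")
def watch_test_node(data, stat):
    # called once with the current value, then again on every change
    if stat is None:
        print("watch: /xy/test does not exist yet")
    else:
        print("watch: /xy/test version=%s data=%r" % (stat.version, data))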
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout', None) or 30),
                                  command_retry={'deadline': (config.get('reconnect_timeout', None) or 10),
                                                 'max_delay': 1, 'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)

        self.cluster_event = self.client.handler.event_object()
        self.cluster = None
        self.fetch_cluster = True
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, key, watch=None):
        try:
            ret = self.client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.version, name, conn_url, api_url, None, None)

    def get_children(self, key, watch=None):
        try:
            return self.client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.cluster_event.clear()
        nodes = set(self.get_children(self.client_path('')))

        # get initialize flag
        initialize = self._INITIALIZE in nodes

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path, self.cluster_watcher) if self._LEADER in nodes else None
        if leader:
            client_id = self.client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self.client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, None, None, None)
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, None, None, member)
                self.fetch_cluster = member.index == -1

        # get last leader operation
        self.last_leader_operation = self.get_node(self.leader_optime_path) if self.fetch_cluster else None
        self.last_leader_operation = 0 if self.last_leader_operation is None else int(self.last_leader_operation[0])

        self.cluster = Cluster(initialize, leader, self.last_leader_operation, members)

    def get_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')

        return self.cluster

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def initialize(self):
        return self._create(self.initialize_path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        if self.cluster and any(m.name == self._name for m in self.cluster.members):
            return True
        path = self.member_path
        connection_string = connection_string.encode('utf-8')
        try:
            self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
                return True
            except:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        last_operation = state_handler.last_operation().encode('utf-8')
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.leader_optime_path
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        if isinstance(self.cluster, Cluster) and self.cluster.leader.name == self._name:
            self.client.delete(self.leader_path, version=self.cluster.leader.index)

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node and node[0] == self._name:
            self.client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self.client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def watch(self, timeout):
        self.cluster_event.wait(timeout)
        if self.cluster_event.isSet():
            self.fetch_cluster = True
class Elector(threading.Thread): LOCKING, NOLOCK = 1,2 def __init__(self, zha): threading.Thread.__init__(self) self.zha = zha self.should_run = True self.in_entry_act = False self.in_entry_sby = False self.state = Elector.NOLOCK self.zk = KazooClient(hosts=self.zha.config.get("connection_string","127.0.0.1:2181"), logger=logger) self.zk.add_listener(self.zk_listener) self.zk.start() self.id = self.zha.config.get("id") self.lock = self.zk.Lock(self.zha.config.get("lock_znode","/zha-lock"), self.id) self.abcpath = self.zha.config.get("abc_znode","/zha-abc") #callbacks def on_become_active(self): if self.zha.config.become_active() == 0: logger.info("successfully become active") self.zha.set_state("ACT:HEALTHY") return True else: logger.info("activation failed..") return False def on_become_active_to_standby(self): self.zha.set_state("SBY:UNKNOWN") # state changed to SBY anyway. if self.zha.config.become_standby_from_active() == 0: logger.info("successfully become standby") return True else: logger.info("could not retire cleanly...") return False def on_fence(self): if self.zha.config.trigger_fence() == 0: logger.info("shooted the node") return True else: logger.info("could not retire cleanly...") return False def run(self): while self.should_run: self.in_elector_loop() time.sleep(self.zha.config.get("elector_interval",3)) self.retire() self.zk.stop() logger.info("elector thread stopped.") def in_elector_loop(self): if self.zk.state != KazooState.CONNECTED: # zk listener will callback on LOST, so no need to call self.retire(), # but it takes a bit long to be LOST. Mostly other zha will fence me. return #for locker if self.state == Elector.LOCKING: if self.in_entry_act is False: self.retire() return return #for waiters try: lock_result = self.lock.acquire(timeout=self.zha.config.get("elector_interval",3)) except LockTimeout: self.retire() logger.info("lock timeout") return if self.in_entry_act is False: self.retire() return if self.handle_abc() is False: self.retire() return if self.on_become_active() is False: self.zk_delete_my_abc() self.retire() return # if reached here, all done with lock self.state = Elector.LOCKING def zk_listener(self,zkstate): logger.info("zookeeper connection state changed %s"%(zkstate,) ) if zkstate == KazooState.LOST: logger.info("(connection to zookeeper is lost/closed)") if self.state != Elector.LOCKING: return logger.info("become standby due to zk connection problem.") self.on_become_active_to_standby() self.state = Elector.NOLOCK elif zkstate == KazooState.SUSPENDED: return else: return def retire(self): if self.state == Elector.LOCKING: if self.on_become_active_to_standby(): self.zk_delete_my_abc() #dont care it succeeds or not, that is, may become standby leaving abc behind. self.state = Elector.NOLOCK self.lock.release() def handle_abc(self): if not self.zk.retry(self.zk.exists,self.abcpath): self.zk.retry(self.zk.create, self.abcpath, self.id) return True data, stat = self.zk.retry(self.zk.get, self.abcpath) if data.strip()==self.id: return True else: if self.on_fence() is False: return False self.zk.retry(self.zk.set, self.abcpath, self.id) return True def zk_delete_my_abc(self): try: data, stat = self.zk.get(self.abcpath) assert data.strip() == self.id self.zk.delete(self.abcpath) return True except: return False
class DeploymentConfig(object): """ Accesses deployment configuration options. """ # The ZooKeeper node where configuration is stored. CONFIG_ROOT = '/appscale/config' def __init__(self, hosts): """ Creates new DeploymentConfig object. Args: hosts: A list of ZooKeeper hosts. """ self.logger = logging.getLogger(self.__class__.__name__) self.update_lock = Lock() self.state = ConfigStates.LOADING self.config = {} self.conn = KazooClient(hosts=hosts, read_only=True) self.conn.add_listener(self._conn_listener) self.conn.start() self.conn.ensure_path(self.CONFIG_ROOT) self.conn.ChildrenWatch(self.CONFIG_ROOT, func=self._update_config) def _conn_listener(self, state): """ Handles changes in ZooKeeper connection state. Args: state: A string indicating the new state. """ if state == KazooState.LOST: self.logger.warning('ZK connection lost') if state == KazooState.SUSPENDED: self.logger.warning('ZK connection suspended') else: self.logger.info('ZK connection established') def _load_child(self, child): """ Fetches the data for a configuration node. Args: child: A string containing the ZooKeeper node to fetch. Returns: A dictionary containing configuration data. Raises: InaccessibleConfig if ZooKeeper is not accessible. """ node = '/'.join([self.CONFIG_ROOT, child]) try: data, _ = self.conn.retry(self.conn.get, node) except (KazooException, ZookeeperError): raise ConfigInaccessible('ZooKeeper connection not available') except NoNodeError: return {} try: return json.loads(data) except ValueError: self.logger.warning('Invalid deployment config: {}'.format(child)) return {} def _update_config(self, children): """ Updates configuration when it changes. Args: children: A list of ZooKeeper nodes. """ with self.update_lock: self.state = ConfigStates.LOADING # Ensure old sections are removed. self.config = {} for child in children: while True: try: self.config[child] = self._load_child(child) break except ConfigInaccessible as load_error: self.logger.warning(str(load_error)) time.sleep(SMALL_WAIT) self.logger.info('Deployment configuration updated') self.state = ConfigStates.LOADED def get_config(self, section): """ Fetches the configuration for a given section. Args: section: A string specifying the section to fetch. Returns: A dictionary containing configuration data. Raises: InaccessibleConfig if ZooKeeper is inaccessible. """ # If the connection is established, it should finish loading very soon. while (self.state == ConfigStates.LOADING and self.conn.state not in (KazooState.LOST, KazooState.SUSPENDED)): time.sleep(TINY_WAIT) if self.state != ConfigStates.LOADED: raise ConfigInaccessible('ZooKeeper connection not available') with self.update_lock: if section not in self.config: return {} return self.config[section] def close(self): """ Close the ZooKeeper connection. """ self.conn.stop()
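# A usage sketch for DeploymentConfig above (the 'cache' section name is only an
# example); callers typically treat ConfigInaccessible as a transient condition:
config = DeploymentConfig(hosts=['zk1', 'zk2', 'zk3'])

while True:
    try:
        cache_settings = config.get_config('cache')
        break
    except ConfigInaccessible:
        # ZooKeeper is unreachable; wait and try again
        time.sleep(SMALL_WAIT)

print(cache_settings)
config.close()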
class ZkCoordinator(StaticCoordinator): @staticmethod def fromGroup(zkConnect: str, group: str) -> 'ZkCoordinator': """Convenience method for instantiation using conventional paths based on group. The path convention is: /static_assignment/[group]/assignment /static_assignment/[group]/members Args: zkConnect (str): Comma-separated list of hosts to connect to (e.g. 127.0.0.1:2181,127.0.0.1:2182,[::1]:2183). group (str): The name of the consumer group this coordinator belongs to. Must not be None. """ if group is None or len(group.strip()) == 0: raise ValueError('ZkCoordinator: Invalid `group` argument, it must not be None or blank.') prePath = f'/static_assignment/{group.strip()}' assignmentPath = f'{prePath}/assignment' membersPath = f'{prePath}/members' return ZkCoordinator(zkConnect, membersPath, assignmentPath) def __init__(self, zkConnect: str, membersPath: str, assignmentsPath: str): """Zookeeper implementation of `StaticCoordinator` Args: zkConnect (str): Comma-separated list of hosts to connect to (e.g. 127.0.0.1:2181,127.0.0.1:2182,[::1]:2183). membersPath (str): Zookeeper path at which members will create ephemeral nodes asserting their ID. assignmentsPath (str): Zookeeper path at which the current assignments are kept. """ for val, name in ((zkConnect, 'zkConnect'), (membersPath, 'membersPath'), (assignmentsPath, 'assignmentsPath')): if val is None or len(val.strip()) == 0: raise ValueError(f'ZkCoordinator: Invalid `{name}` argument, it must not be None or blank') logger.info('ZkCoordinator starting with membersPath=%s, assignmentsPath=%s', membersPath, assignmentsPath) self._zkConnect = zkConnect self._membersPath = membersPath self._membersPathEnsured = False self._assignmentsPath = assignmentsPath self._assignmentsPathEnsured = False self._currentAssignment = None self._assignmentsWatcher = None self._memberMetaData: Optional[StaticMemberMeta] = None self.zk = KazooClient(hosts=zkConnect) self.zk.add_listener(self._zkListener()) self._memberId: Optional[MemberId] = None def _zkListener(self): def listener(state): if state == KazooState.LOST: self._memberId = None self._currentAssignment = None return listener def _establishSession(self): if self._assignmentsWatcher is None: # add watch for assignment updates def watchAssignments(data, stat, event): self._currentAssignment = self._processAssignmentsData(data) logger.info('Assignment update received. | assignments=%s', self._currentAssignment) self._ensureAssignmentsPath() self._assignmentsWatcher = DataWatch(self.zk, self._assignmentsPath, watchAssignments) def _ensureAssignmentsPath(self): if not self._assignmentsPathEnsured: self.zk.ensure_path(self._assignmentsPath) self._assignmentsPathEnsured = True def _fetchAssignments(self) -> Optional[Assignments]: return self._currentAssignment def _processAssignmentsData(self, rawData): if rawData is not None: return Assignments.fromJson(rawData.decode('utf-8')) def _ensureMembersPath(self): if not self._membersPathEnsured: self.zk.ensure_path(self._membersPath) self._membersPathEnsured = True def _createPath(self, altMemberId: MemberId = None): mid = self._memberId if altMemberId is not None: mid = altMemberId if mid is not None: return f'{self._membersPath}/{mid}' return None def _encodeMemberData(self, meta: StaticMemberMeta): return ujson.dumps(meta.asDict()).encode('utf-8') def _compareAndUpdateMemberData(self, meta: StaticMemberMeta): newDict = None selfDict = None if self._memberMetaData is not None and meta is not None: selfDict = self._memberMetaData.asDict() newDict = meta.asDict() isDiff = ( selfDict['hostId'] != newDict['hostId'] or selfDict['assignment']['configVersion'] != newDict['assignment']['configVersion'] or selfDict['assignment']['version'] != newDict['assignment']['version'] ) else: isDiff = True if isDiff: self._memberMetaData = meta path = self._createPath() if path is not None: def cb(async_obj): try: async_obj.get() logger.info('Member meta data updated. | metaData=%s', meta) except (ConnectionLoss, SessionExpiredError): logger.exception('Failed to update member meta data.') self.zk.set_async(path, self._encodeMemberData(meta)).rawlink(cb) def updateAssignments(self, meta: StaticMemberMeta, newAssignments: Assignments): self.zk.retry(self._innerUpdateAssignment, newAssignments) def _innerUpdateAssignment(self, assignment: Assignments): self._ensureAssignmentsPath() self.zk.set(self._assignmentsPath, assignment.asJson().encode('utf-8')) logger.info('Assignments updated. | assignments=%s', assignment) def leave(self, meta: StaticMemberMeta): self.zk.retry(self._innerLeave) def _innerLeave(self): path = self._createPath() if path is not None: try: self.zk.delete(path) except (ConnectionLoss, SessionExpiredError): logger.exception( 'Failed to relinquish member ID, ' 'will assume the ephemeral node will expire on its own. ' '| memberId=%s', self._memberId, ) self._memberId = None def join(self, meta: StaticMemberMeta): asgns = self._fetchAssignments() if asgns is None: logger.warning('Cannot join a group without assignments. | assignmentsPath=%s', self._assignmentsPath) return None if self._memberId is None: self._memberId = self._inner_join(meta, asgns.maxMembers) return self._memberId def _inner_join(self, meta: StaticMemberMeta, maxMembers: int) -> Optional[MemberId]: idList = range(maxMembers) memberData = self._encodeMemberData(meta) self._ensureMembersPath() foundMid = None for mid in idList: memberIdPath = self._createPath(mid) try: self.zk.create(memberIdPath, memberData, ephemeral=True) foundMid = mid logger.debug('Member id acquired. | memberId=%s', mid) break except NodeExistsError: # move on to the next node logger.debug('Member id already taken, moving to next. | memberId=%s', mid) except (ConnectionLoss, SessionExpiredError): logger.exception('Member id acquisition attempt failed with error.') time.sleep(1) self._memberMetaData = meta return foundMid def assignments(self, meta: StaticMemberMeta) -> Optional[Assignments]: self._compareAndUpdateMemberData(meta) return self._fetchAssignments() def heartbeat(self, meta: StaticMemberMeta) -> Optional[MemberId]: self._compareAndUpdateMemberData(meta) return self._memberId def stop(self): self.zk.stop() self.zk.close() def start(self): self.zk.start() self._establishSession()
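ZkCoordinator._inner_join claims the first free slot between 0 and maxMembers-1 by creating an ephemeral znode per slot and skipping NodeExistsError. A self-contained sketch of that acquisition loop in isolation; the members path, the max_members value, and the JSON payload are illustrative placeholders rather than values from any real deployment:

from typing import Optional
from kazoo.client import KazooClient
from kazoo.exceptions import NodeExistsError

def acquire_member_id(zk: KazooClient, members_path: str, max_members: int,
                      member_data: bytes) -> Optional[int]:
    zk.ensure_path(members_path)
    for mid in range(max_members):
        try:
            zk.create(f'{members_path}/{mid}', member_data, ephemeral=True)
            return mid           # slot claimed; the node disappears when the session ends
        except NodeExistsError:
            continue             # slot already taken, try the next one
    return None                  # group is full

if __name__ == '__main__':
    zk = KazooClient(hosts='127.0.0.1:2181')  # assumed local ZooKeeper
    zk.start()
    print(acquire_member_id(zk, '/static_assignment/demo/members', 8, b'{"hostId": "demo"}'))
    zk.stop()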
class ZookeeperServiceRegistry(BaseServiceRegistry): def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT): super(ZookeeperServiceRegistry, self).__init__() self.chroot = chroot self.client = KazooClient( hosts=hosts, handler=SequentialGeventHandler(), ) self.client.add_listener(self.on_kazoo_state_change) self.start_count = 0 @classmethod def from_config(cls, config, **kwargs): return cls( hosts=config.get('hosts', DEFAULT_HOSTS), chroot=config.get('chroot', DEFAULT_CHROOT), **kwargs ) def on_start(self, timeout=10): self.start_count += 1 if self.start_count > 1: return started = self.client.start_async() started.wait(timeout=timeout) if not self.client.connected: raise RuntimeError('could not connect to zookeeper') logger.debug('connected to zookeeper (version=%s)', '.'.join(map(str, self.client.server_version()))) def on_stop(self): self.start_count -= 1 if self.start_count != 0: return self.client.stop() def on_kazoo_state_change(self, state): logger.info('kazoo connection state changed to %s', state) def on_service_type_watch(self, service, event): try: if event.type == EventType.CHILD: # FIXME: figure out proper retry strategy self.client.retry(self.lookup, service.container, service) except Exception: logger.exception('error in service type watcher') def on_service_watch(self, service, event): try: prefix, service_type, identity = event.path.rsplit('/', 2) if event.type == EventType.DELETED: service.remove(identity) except Exception: logger.exception('error in service watcher') def _get_service_znode(self, service, service_type, identity): path = self._get_zk_path(service_type, identity) result = self.client.get_async( path, watch=functools.partial(self.on_service_watch, service)) value, znode = result.get() items = six.iteritems(json.loads(value.decode('utf-8'))) return {str(k): str(v) for k, v in items} def discover(self, container): result = self.client.get_children_async( path='%s/services' % self.chroot, ) return list(result.get()) def lookup(self, container, service, watch=True, timeout=1): def child_watch(event): print(event) service_type = service.service_type result = self.client.get_children_async( path='%s/services/%s' % (self.chroot, service_type), watch=functools.partial(self.on_service_type_watch, service), ) try: names = result.get(timeout=timeout) except NoNodeError: raise LookupFailure(None, "failed to resolve %s" % service.service_type) logger.info("lookup %s %r", service_type, names) identities = set(service.identities()) for name in names: kwargs = self._get_service_znode(service, service_type, name) identity = kwargs.pop('identity') service.update(identity, **kwargs) try: identities.remove(identity) except KeyError: pass for identity in identities: service.remove(identity) return service def _get_zk_path(self, service_type, identity): return '%s/services/%s/%s' % (self.chroot, service_type, identity) def register(self, container, service_type, timeout=1): path = self._get_zk_path(service_type, container.identity) value = json.dumps({ 'endpoint': container.endpoint, 'identity': container.identity, 'log_endpoint': container.log_endpoint, }) result = self.client.create_async( path, value.encode('utf-8'), ephemeral=True, makepath=True) # FIXME: result.set_exception(RegistrationFailure()) result.get(timeout=timeout) def unregister(self, container, service_type, timeout=1): path = self._get_zk_path(service_type, container.identity) result = self.client.delete_async(path) result.set_exception(RegistrationFailure()) result.get(timeout=timeout)
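ZookeeperServiceRegistry leans on kazoo's asynchronous API: the *_async calls return an IAsyncResult whose get(timeout=...) blocks until the operation completes. A reduced sketch of the register/read-back round trip using the default threading handler (the gevent handler above is optional here); the /services/demo path and payload are made up for illustration:

import json
from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')   # assumed local ZooKeeper
zk.start()

payload = json.dumps({'endpoint': 'tcp://10.0.0.1:4000', 'identity': 'node-1'}).encode('utf-8')
result = zk.create_async('/services/demo/node-1', payload, ephemeral=True, makepath=True)
result.get(timeout=1)                      # block until the registration is acknowledged

value, stat = zk.get_async('/services/demo/node-1').get(timeout=1)
print(json.loads(value.decode('utf-8')), stat.version)

zk.stop()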
class _ZookeeperProxy(object): def __init__(self, address_provider: AddressListProvider, prefix: str): self.address_provider = address_provider self.async_counter = WaitingCounter(limit=100) self.conn_str = None self.client = None self.prefix = prefix self.hosts_cache = SlowlyUpdatedCache( self.address_provider.get_latest_address, self._update_hosts, 30, # Refresh every 30 seconds 3 * 60) # Update only after 180 seconds of stability def _update_hosts(self, value): hosts, port = value if hosts: self.conn_str = ','.join(['{}:{}'.format(h, port) for h in hosts]) + self.prefix if self.client is None: self.client = KazooClient(hosts=self.conn_str, command_retry={ 'deadline': 120, 'max_delay': 1, 'max_tries': -1 }, connection_retry={ 'max_delay': 1, 'max_tries': -1 }) self.client.add_listener(self.session_listener) else: self.client.stop() self.client.set_hosts(self.conn_str) self.client.start() def terminate(self): if self.client: self.client.stop() def session_listener(self, state): pass def get_conn_str(self): return self.conn_str def get(self, *params): self.hosts_cache.touch() return self.client.retry(self.client.get, *params) def get_async(self, *params): # Exhibitor is not polled here and it's totally fine! self.async_counter.increment() try: i_async = self.client.get_async(*params) i_async.rawlink(self._decrement) return i_async except Exception as e: self._decrement() raise e def _decrement(self, *args, **kwargs): self.async_counter.decrement() def set(self, *args, **kwargs): self.hosts_cache.touch() return self.client.retry(self.client.set, *args, **kwargs) def create(self, *args, **kwargs): self.hosts_cache.touch() return self.client.retry(self.client.create, *args, **kwargs) def delete(self, *args, **kwargs): self.hosts_cache.touch() try: return self.client.retry(self.client.delete, *args, **kwargs) except NoNodeError: pass def get_children(self, *params): self.hosts_cache.touch() try: return self.client.retry(self.client.get_children, *params) except NoNodeError: return [] def take_lock(self, *args, **kwargs): while True: try: self.hosts_cache.touch() return self.client.Lock(*args, **kwargs) except Exception as e: _LOG.error('Failed to obtain lock for exhibitor, retrying', exc_info=e)
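The proxy's get_async keeps a count of in-flight asynchronous reads and decrements it from rawlink() when each result completes. A cut-down sketch of that bookkeeping, with threading.Semaphore standing in for the WaitingCounter helper that is not shown in this snippet; the host and path are illustrative:

import threading
from kazoo.client import KazooClient

in_flight = threading.Semaphore(100)       # at most 100 outstanding async reads

def get_async_limited(zk: KazooClient, path: str):
    in_flight.acquire()
    try:
        async_result = zk.get_async(path)
        async_result.rawlink(lambda _result: in_flight.release())   # release when the read completes
        return async_result
    except Exception:
        in_flight.release()
        raise

if __name__ == '__main__':
    zk = KazooClient(hosts='127.0.0.1:2181')  # assumed local ZooKeeper
    zk.start()
    data, stat = get_async_limited(zk, '/zookeeper').get(timeout=5)
    zk.stop()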
class ZkDefinitionWatcher: def __init__(self, zk_hosts, def_path='/env/?/sd/!'): c_retry = KazooRetry(-1, max_delay=60) self._zk = KazooClient(','.join(zk_hosts), read_only=True, connection_retry=c_retry) self._zk.start(timeout=10) self._child_watchers = {} self._data_watchers = {} # Note about "stack": the stack is a ZK path of where to find service definitions. It solves the problem of a # tree watcher watching an entire tree. A node is only watched if it adheres to the stack pattern. A "?" means # "watch children of whatever is here" (this must be the environment). A "!" means "watch data of whatever is # here" (this must be the service defs). Anything else is a constant, so it will only watch the children of that # specific node if it exists. self._stack = tuple(def_path.strip('/').split('/')) we = WatchedEvent(None, None, '/' + self._stack[0]) self._watch_children(we) def poll(self): env_index = self._stack.index('?') name_index = self._stack.index('!') ret = {} for path, data in self._data_watchers.items(): # Get env from path env = path.strip('/').split('/')[env_index] # Get name from path name = path.strip('/').split('/')[name_index] data['path'] = path ret[(env, name)] = data return ret def _watch_children(self, event): # Called when a child node is deleted if event.type == EventType.DELETED: # Remove child watcher from our records del self._child_watchers[event.path] # remove datawatchers? return # Get children and set a child watch for the next event of this path children = self._zk.retry(self._zk.get_children, event.path, watch=self._watch_children) # Update our records self._child_watchers[event.path] = children # If no children, there is nothing to do; no watchers to set if len(children) == 0: return # Find location in stack for children level = len(event.path.strip('/').split('/')) child_depth_marker = self._stack[level] for child in children: path = "{0}/{1}".format(event.path, child) if child_depth_marker == '?' or child == child_depth_marker: # Set child_watcher for each child if path not in self._child_watchers: we = WatchedEvent(None, None, path) self._watch_children(we) elif child_depth_marker == '!': # Set data_watcher for each child if path not in self._data_watchers: we = WatchedEvent(None, None, path) self._watch_data(we) def _watch_data(self, event): # Called when a data node is deleted if event.type == EventType.DELETED: # Remove data watcher from our records del self._data_watchers[event.path] return # Get data and set a data watch for the next event of this path data, _stat = self._zk.retry(self._zk.get, event.path, watch=self._watch_data) # Update our records try: # TODO: validate JSON? parsed_data = json.loads(data) self._data_watchers[event.path] = parsed_data except ValueError: logger.warning('Service definition "' + event.path + '" cannot be parsed as JSON') # If service def isn't in proper format, remove it from known service defs if event.path in self._data_watchers: del self._data_watchers[event.path]
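ZkDefinitionWatcher wires raw watch callbacks by hand; kazoo's ChildrenWatch and DataWatch recipes cover the common parts of that job. A minimal sketch under an assumed '/env/production/sd/<service>' layout standing in for the '?'/'!' stack pattern; the host and paths are illustrative:

import json
from kazoo.client import KazooClient
from kazoo.recipe.watchers import ChildrenWatch, DataWatch

zk = KazooClient(hosts='127.0.0.1:2181')       # assumed local ZooKeeper
zk.start()
zk.ensure_path('/env/production/sd')           # assumed environment node, created so the watch can attach

definitions = {}                               # path -> parsed service definition
watched = set()

def watch_definition(path):
    if path in watched:
        return
    watched.add(path)

    def on_data(data, stat):
        # re-fires on every change or deletion of the definition node
        if data is None:
            definitions.pop(path, None)        # node deleted
            return
        try:
            definitions[path] = json.loads(data)
        except ValueError:
            definitions.pop(path, None)        # not valid JSON, drop it

    DataWatch(zk, path, on_data)

@ChildrenWatch(zk, '/env/production/sd')
def on_children(children):
    for child in children:
        watch_definition('/env/production/sd/' + child)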
if zk_client.exists('/node1'): print('node /node1 exists, node path /node1') data, stat = zk_client.get('/node1') if stat: print("Version: %s, data: %s" % (stat.version, data.decode("utf-8"))) children = zk_client.get_children('/node1') print('node1 has %s child node(s), named: %s' % (len(children), children)) print("children of '/':", zk_client.get_children('/')) zk_client.set('/node1/subNode2', b'some new data') zk_client.delete('/node1', recursive=True) try: result = zk_client.retry(zk_client.get, '/node1/subNode3') print(result) kr = KazooRetry(max_tries=3, ignore_expire=False) result = kr(zk_client.get, '/node1/subNode3') except Exception as e: print('/node1/subNode3 does not exist, so this raises an error') zk_client.stop() while zk_conn_status != 3: continue else: i = 0 while i < 300: if i % 20 == 0: time.sleep(2) print('creating a new node')
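The demo above references zk_client and zk_conn_status without showing how they were set up. A hedged guess at the missing prelude, tracking connection state with a kazoo listener; note that the comparison against the literal 3 in the loop above is the demo's own convention and not something kazoo provides, so the listener below records KazooState values instead:

import time
from kazoo.client import KazooClient
from kazoo.retry import KazooRetry           # used by the kr(...) call in the demo

zk_conn_status = None

def conn_listener(state):
    # records KazooState.CONNECTED / SUSPENDED / LOST as the session changes
    global zk_conn_status
    zk_conn_status = state

zk_client = KazooClient(hosts='127.0.0.1:2181')   # assumed local ZooKeeper
zk_client.add_listener(conn_listener)
zk_client.start()
zk_client.ensure_path('/node1/subNode2')          # gives the get/set/delete calls above something to act on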
zk.start() base_zk_path = '%s/%s' % (service_ns, service_id) def resolve_path(path): rel_path = relpath(path, config_dir) return base_zk_path if rel_path == '.' else join(base_zk_path, rel_path) if exists(config_dir) and isdir(config_dir): print('Acquiring access lock...', file=stderr) with zk.Lock(base_zk_path + '.lock', node_id): for dirname, dirs, files in os.walk(config_dir): zk.ensure_path(resolve_path(dirname)) print(' Directory zk://' + resolve_path(dirname), file=stderr) for filename in files: filename = join(dirname, filename) config_path = resolve_path(filename) value = open(filename, 'rb').read() if zk.exists(config_path): print(' Updating zk://%s from %s [%d bytes]' % (config_path, filename, len(value)), file=stderr) zk.retry(zk.set, config_path, value) else: print(' Creating zk://%s from %s [%d bytes]' % (config_path, filename, len(value)), file=stderr) zk.retry(zk.create, config_path, value) else: print('Invalid configuration directory', file=stderr) success = True zk.stop()
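As a companion to the upload loop above, the same retry calls can read the tree back for verification. A small sketch assuming a fresh client and an illustrative /myservice/instance-1 base path in place of base_zk_path:

from kazoo.client import KazooClient

def dump_tree(zk, path, indent=0):
    # read each node through the same retry wrapper used for the writes above
    data, _stat = zk.retry(zk.get, path)
    print(' ' * indent + path + (' [%d bytes]' % len(data) if data else ''))
    for child in zk.retry(zk.get_children, path):
        dump_tree(zk, path + '/' + child, indent + 2)

zk = KazooClient(hosts='127.0.0.1:2181')   # assumed local ZooKeeper
zk.start()
if zk.exists('/myservice/instance-1'):     # illustrative base path
    dump_tree(zk, '/myservice/instance-1')
zk.stop()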
class ZooKeeper(AbstractDCS): def __init__(self, config): super(ZooKeeper, self).__init__(config) hosts = config.get('hosts', []) if isinstance(hosts, list): hosts = ','.join(hosts) self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']), timeout=config['ttl'], connection_retry={'max_delay': 1, 'max_tries': -1}, command_retry={'deadline': config['retry_timeout'], 'max_delay': 1, 'max_tries': -1}) self._client.add_listener(self.session_listener) self._my_member_data = None self._fetch_cluster = True self._last_leader_operation = 0 self._orig_kazoo_connect = self._client._connection._connect self._client._connection._connect = self._kazoo_connect self._client.start() def _kazoo_connect(self, host, port): """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no response on Ping after Ping interval (1/2 from read_timeout) it will consider current connection dead and try to connect to another node. Without this "magic" it was taking up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had only small time for reconnect and retry. This method is needed to return different value of read_timeout, which is not calculated from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and `write_leader_optime()` methods, which also may hang...""" ret = self._orig_kazoo_connect(host, port) return max(self.loop_wait - 2, 2)*1000, ret[1] def session_listener(self, state): if state in [KazooState.SUSPENDED, KazooState.LOST]: self.cluster_watcher(None) def cluster_watcher(self, event): self._fetch_cluster = True self.event.set() def reload_config(self, config): self.set_retry_timeout(config['retry_timeout']) loop_wait = config['loop_wait'] loop_wait_changed = self._loop_wait != loop_wait self._loop_wait = loop_wait self._client.handler.set_connect_timeout(loop_wait) # We need to reestablish connection to zookeeper if we want to change # read_timeout (and Ping interval respectively), because read_timeout # is calculated in `_kazoo_connect` method. If we are changing ttl at # the same time, set_ttl method will reestablish connection and return # `!True`, otherwise we will close existing connection and let kazoo # open the new one. if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed: self._client._connection._socket.close() def set_ttl(self, ttl): """It is not possible to change ttl (session_timeout) in zookeeper without destroying old session and creating the new one. 
This method returns `!True` if session_timeout has been changed (`restart()` has been called).""" if self._client._session_timeout != ttl: self._client._session_timeout = ttl self._client.restart() return True def set_retry_timeout(self, retry_timeout): self._client._retry.deadline = retry_timeout def get_node(self, key, watch=None): try: ret = self._client.get(key, watch) return (ret[0].decode('utf-8'), ret[1]) except NoNodeError: return None @staticmethod def member(name, value, znode): return Member.from_node(znode.version, name, znode.ephemeralOwner, value) def get_children(self, key, watch=None): try: return self._client.get_children(key, watch) except NoNodeError: return [] def load_members(self): members = [] for member in self.get_children(self.members_path, self.cluster_watcher): data = self.get_node(self.members_path + member) if data is not None: members.append(self.member(member, *data)) return members def _inner_load_cluster(self): self._fetch_cluster = False self.event.clear() nodes = set(self.get_children(self.client_path(''), self.cluster_watcher)) if not nodes: self._fetch_cluster = True # get initialize flag initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None # get global dynamic configuration config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid) # get list of members members = self.load_members() if self._MEMBERS[:-1] in nodes else [] # get leader leader = self.get_node(self.leader_path) if self._LEADER in nodes else None if leader: client_id = self._client.client_id if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner: logger.info('I am leader but not owner of the session. Removing leader node') self._client.delete(self.leader_path) leader = None if leader: member = Member(-1, leader[0], None, {}) member = ([m for m in members if m.name == leader[0]] or [member])[0] leader = Leader(leader[1].version, leader[1].ephemeralOwner, member) self._fetch_cluster = member.index == -1 # failover key failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None failover = failover and Failover.from_node(failover[1].version, failover[0]) # get last leader operation optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None self._last_leader_operation = 0 if optime is None else int(optime[0]) self._cluster = Cluster(initialize, config, leader, self._last_leader_operation, members, failover) def _load_cluster(self): if self._fetch_cluster or self._cluster is None: try: self._client.retry(self._inner_load_cluster) except Exception: logger.exception('get_cluster') self.cluster_watcher(None) raise ZooKeeperError('ZooKeeper is not responding properly') def _create(self, path, value, **kwargs): try: self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs) return True except Exception: return False def attempt_to_acquire_leader(self, permanent=False): ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=not permanent) if not ret: logger.info('Could not take out TTL lock') return ret def set_failover_value(self, value, index=None): try: self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1) return True except NoNodeError: return value == '' or (index is None and self._create(self.failover_path, value)) except Exception: logger.exception('set_failover_value') return False def set_config_value(self, value, index=None): try: self._client.retry(self._client.set, self.config_path, value.encode('utf-8'), version=index or -1) return True except NoNodeError: return index is None and self._create(self.config_path, value) except Exception: logger.exception('set_config_value') return False def initialize(self, create_new=True, sysid=""): return self._create(self.initialize_path, sysid, makepath=True) if create_new else self._client.retry(self._client.set, self.initialize_path, sysid.encode("utf-8")) def touch_member(self, data, ttl=None, permanent=False): cluster = self.cluster member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0] data = data.encode('utf-8') if member and self._client.client_id is not None and member.session != self._client.client_id[0]: try: self._client.delete_async(self.member_path).get(timeout=1) except NoNodeError: pass except Exception: return False member = None if member: if data == self._my_member_data: return True else: try: self._client.create_async(self.member_path, data, makepath=True, ephemeral=not permanent).get(timeout=1) self._my_member_data = data return True except Exception as e: if not isinstance(e, NodeExistsError): logger.exception('touch_member') return False try: self._client.set_async(self.member_path, data).get(timeout=1) self._my_member_data = data return True except Exception: logger.exception('touch_member') return False def take_leader(self): return self.attempt_to_acquire_leader() def write_leader_optime(self, last_operation): last_operation = last_operation.encode('utf-8') if last_operation != self._last_leader_operation: try: self._client.set_async(self.leader_optime_path, last_operation).get(timeout=1) self._last_leader_operation = last_operation except NoNodeError: try: self._client.create_async(self.leader_optime_path, last_operation, makepath=True).get(timeout=1) self._last_leader_operation = last_operation except Exception: logger.exception('Failed to create %s', self.leader_optime_path) except Exception: logger.exception('Failed to update %s', self.leader_optime_path) def update_leader(self): return True def delete_leader(self): self._client.restart() self._my_member_data = None return True def _cancel_initialization(self): node = self.get_node(self.initialize_path) if node: self._client.delete(self.initialize_path, version=node[1].version) def cancel_initialization(self): try: self._client.retry(self._cancel_initialization) except Exception: logger.exception("Unable to delete initialize key") def delete_cluster(self): try: return self._client.retry(self._client.delete, self.client_path(''), recursive=True) except NoNodeError: return True def watch(self, timeout): if super(ZooKeeper, self).watch(timeout): self._fetch_cluster = True return self._fetch_cluster
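The DCS above passes its retry policy to KazooClient as plain dicts; kazoo accepts either a dict of KazooRetry keyword arguments or a ready KazooRetry instance for connection_retry and command_retry. A minimal sketch of the equivalent explicit construction, with illustrative ttl and retry_timeout values standing in for the config entries used above:

from kazoo.client import KazooClient
from kazoo.retry import KazooRetry

retry_timeout = 10   # seconds, stand-in for config['retry_timeout']
ttl = 30             # seconds, stand-in for config['ttl']

command_retry = KazooRetry(deadline=retry_timeout, max_delay=1, max_tries=-1)
connection_retry = KazooRetry(max_delay=1, max_tries=-1)

zk = KazooClient('127.0.0.1:2181', timeout=ttl,
                 connection_retry=connection_retry, command_retry=command_retry)
zk.start()
print(zk.client_id)   # (session id, password) once the session is established
zk.stop()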
try: result = None result = zk.set("/xy/test", b"some data") except Exception as e: print('exception when zk.set, %s' % e) else: print("zk.set /xy/test result %s" % str(result)) # del result = zk.delete("/xy/test/node", recursive=True) print("zk.delete /xy/test/node result %s" % (result)) # action try: result = zk.retry(zk.get, "/xy/test/nodex") except Exception as e: print('exception when zk.retry, %s' % e) else: print("zk.retry /xy/test/nodex result %s" % str(result)) from kazoo.retry import KazooRetry kr = KazooRetry(max_tries=3, ignore_expire=False) try: result = kr(zk.get, "/xy/test/nodex") except Exception as e: print('exception when KazooRetry, %s' % e) else: print("KazooRetry zk.get /xy/test/nodex result %s" % (result))
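Building on the retry demo above: a standalone KazooRetry only retries connection-level errors, so a missing node still surfaces as NoNodeError immediately, while exhausting max_tries raises RetryFailedError. A short sketch against the same illustrative /xy/test paths:

from kazoo.client import KazooClient
from kazoo.exceptions import NoNodeError
from kazoo.retry import KazooRetry, RetryFailedError

zk = KazooClient(hosts='127.0.0.1:2181')   # assumed local ZooKeeper
zk.start()

kr = KazooRetry(max_tries=3, delay=0.2, ignore_expire=False)
try:
    data, stat = kr(zk.get, '/xy/test/nodex')
except NoNodeError:
    print('/xy/test/nodex does not exist')   # non-retryable errors surface immediately
except RetryFailedError:
    print('gave up after 3 retryable failures')
finally:
    zk.stop()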