def watch_zookeeper_nodes(zookeeper: KazooClient, nodes: Any) -> NoReturn:
    """Watch every configured node and heartbeat its output file forever.

    For each entry in ``nodes`` a ``NodeWatcher`` is built from the entry's
    ``dest``/``owner``/``group``/``mode`` and its ``on_change`` method is
    registered as a Kazoo data watch on ``entry.source``. The function then
    loops forever, touching each ``dest`` file once per
    ``HEARTBEAT_INTERVAL`` while the client reports itself connected.

    :param zookeeper: Client used to register the data watches and whose
        ``connected`` flag gates the heartbeat.
    :param nodes: Re-iterable collection of node descriptions; it is
        traversed once for registration and again on every heartbeat.
    """
    for entry in nodes:
        node_watcher = NodeWatcher(entry.dest, entry.owner, entry.group, entry.mode)
        zookeeper.DataWatch(entry.source, node_watcher.on_change)

    # From here on the real work happens in the Kazoo worker thread; this
    # thread only spins and periodically heartbeats to prove we're alive.
    while True:
        time.sleep(HEARTBEAT_INTERVAL)

        # See the comment in baseplate.live_data.zookeeper for an explanation
        # of how reconnects work with the background thread.
        if not zookeeper.connected:
            continue

        for entry in nodes:
            try:
                logger.debug("Heartbeating %s", entry.dest)

                # Touching the real output file makes FileWatchers re-parse
                # it on their next read, which is unfortunate, but it is
                # deliberate: monitoring the actual file (rather than a
                # separate proxy file or mechanism) catches problems such as
                # bogus permissions on the output that an independent
                # heartbeat would never notice.
                os.utime(entry.dest, None)
            except OSError as exc:
                logger.warning("%s: could not heartbeat: %s", entry.dest, exc)
def get_zk():
    """Return the module-wide ``KazooClient``, creating it on first use.

    On the first call the client is built from
    ``app.config['ZK_CONNECTION_STRING']``, started, authenticated with
    ``jones_credential`` using the ``digest`` scheme, and a data watch
    calling ``ensure_root`` is installed on ``/services``. Later calls
    return the cached client unchanged.
    """
    global _zk

    if _zk is not None:
        return _zk

    connection_string = app.config['ZK_CONNECTION_STRING']
    default_acl = (
        # grants read permissions to anyone.
        make_acl('world', 'anyone', read=True),
        # grants all permissions to the creator of the node.
        make_acl('auth', '', all=True),
    )

    # The global is bound before start() on purpose: the statement order of
    # the original is kept so any callback that re-enters get_zk() sees the
    # same client.
    _zk = KazooClient(connection_string, default_acl=default_acl)
    _zk.start()
    _zk.add_auth('digest', jones_credential)
    _zk.DataWatch('/services', func=ensure_root)
    return _zk
class ZKClient:
    """Registers a service instance in ZooKeeper and mirrors its JSON config.

    Construction connects to ZooKeeper, creates an ephemeral sequential
    ``server`` znode under ``/entry/service/<service_name>/node`` holding the
    encoded ``host``, and installs a data watch on
    ``/entry/config/service/<service_name>`` that refreshes ``self.config``.
    """

    def __init__(self, zk_servers, service_name, host):
        """Connect, register ``host`` and start watching the service config.

        :param zk_servers: Host string handed to ``KazooClient``.
        :param service_name: Name used to build the service and config paths.
        :param host: Text stored (encoded) in the ephemeral server znode.
        """
        client = KazooClient(zk_servers)
        self.zk = client
        client.start()

        self.service_name = service_name
        self.serve_path = "/entry/service/{}/node".format(service_name)
        client.ensure_path(self.serve_path)

        server_node = self.serve_path + "/server"
        client.create(server_node, host.encode(), ephemeral=True, sequence=True)

        self.config_path = "/entry/config/service/{}".format(self.service_name)
        client.DataWatch(self.config_path, self.read_config)

    def read_config(self, *args):
        """Reload ``self.config`` from the config znode.

        Used as the data-watch callback; whatever arguments the watch passes
        are ignored and the znode is read again. If the znode is missing it
        is created holding an empty JSON object first.
        """
        path = self.config_path
        self.zk.ensure_path("/entry/config/service")

        if not self.zk.exists(path):
            empty_payload = json.dumps({}).encode()
            self.zk.create(path, empty_payload)

        raw = self.zk.get(path)[0]
        self.config = json.loads(raw.decode())

    def update_config(self, config):
        """Serialize ``config`` as JSON and write it to the config znode."""
        payload = json.dumps(config).encode()
        self.zk.set(self.config_path, payload)
class ZooKeeper(object):
    '''
    Class implementing the ZooKeeper interface.

    This class uses the facade design pattern to keep common interaction
    with the ZooKeeper API simple and consistent for the caller, and
    limits coupling between objects. It allows for more complex interactions
    by providing direct access to the client connection when needed (though
    that is discouraged). It also provides for a convenient entry point for
    testing only ZooKeeper interactions.
    '''

    log = logging.getLogger("zuul.zk.ZooKeeper")

    REQUEST_ROOT = '/nodepool/requests'
    NODE_ROOT = '/nodepool/nodes'

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self):
        '''
        Initialize the ZooKeeper object.
        '''
        self.client = None
        self._became_lost = False
        self._last_retry_log = 0

    def _dictToStr(self, data):
        '''Serialize a dict to UTF-8 encoded JSON bytes.'''
        return json.dumps(data).encode('utf8')

    def _strToDict(self, data):
        '''Deserialize UTF-8 encoded JSON bytes into a Python object.'''
        return json.loads(data.decode('utf8'))

    def _connection_listener(self, state):
        '''
        Listener method for Kazoo connection state changes.

        .. warning:: This method must not block.
        '''
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
            # Sticky flag: stays set until resetLostFlag() is called.
            self._became_lost = True
        elif state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
        else:
            self.log.debug("ZooKeeper connection: CONNECTED")

    @property
    def connected(self):
        '''True while the client state is CONNECTED.'''
        return self.client.state == KazooState.CONNECTED

    @property
    def suspended(self):
        '''True while the client state is SUSPENDED.'''
        return self.client.state == KazooState.SUSPENDED

    @property
    def lost(self):
        '''True while the client state is LOST.'''
        return self.client.state == KazooState.LOST

    @property
    def didLoseConnection(self):
        '''True if a LOST state was seen since the last resetLostFlag().'''
        return self._became_lost

    def resetLostFlag(self):
        '''Clear the flag reported by didLoseConnection.'''
        self._became_lost = False

    def logConnectionRetryEvent(self):
        '''Log a retry warning, at most once per retry_log_rate seconds.'''
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    def connect(self, hosts, read_only=False, timeout=10.0):
        '''
        Establish a connection with ZooKeeper cluster.

        Convenience method if a pre-existing ZooKeeper connection is not
        supplied to the ZooKeeper object at instantiation time.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        :param bool read_only: If True, establishes a read-only connection.
        :param float timeout: The ZooKeeper session timeout, in
            seconds (default: 10.0).
        '''
        if self.client is None:
            self.client = KazooClient(hosts=hosts, read_only=read_only,
                                      timeout=timeout)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            while True:
                try:
                    self.client.start(1)
                    break
                except KazooTimeoutError:
                    self.logConnectionRetryEvent()

    def disconnect(self):
        '''
        Close the ZooKeeper cluster connection.

        You should call this method if you used connect() to establish a
        cluster connection.
        '''
        if self.client is not None and self.client.connected:
            self.client.stop()
            self.client.close()
            self.client = None

    def resetHosts(self, hosts):
        '''
        Reset the ZooKeeper cluster connection host list.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        '''
        if self.client is not None:
            self.client.set_hosts(hosts=hosts)

    def submitNodeRequest(self, node_request, watcher):
        '''
        Submit a request for nodes to Nodepool.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.

        :param callable watcher: A callable object that will be
            invoked each time the request is updated.  It is called
            with two arguments: (node_request, deleted) where
            node_request is the same argument passed to this method,
            and deleted is a boolean which is True if the node no
            longer exists (notably, this will happen on disconnection
            from ZooKeeper).  The watcher should return False when
            further updates are no longer necessary.
        '''
        data = node_request.toDict()
        data['created_time'] = time.time()

        path = '%s/%s-' % (self.REQUEST_ROOT, node_request.priority)
        path = self.client.create(path, self._dictToStr(data),
                                  makepath=True,
                                  sequence=True, ephemeral=True)
        # The request id is the last path component of the created znode.
        reqid = path.split("/")[-1]
        node_request.id = reqid

        def callback(data, stat):
            if data:
                data = self._strToDict(data)
                request_nodes = list(node_request.nodeset.getNodes())
                # Node ids are matched to the request's nodes by position.
                for i, nodeid in enumerate(data.get('nodes', [])):
                    node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
                    node_data, node_stat = self.client.get(node_path)
                    node_data = self._strToDict(node_data)
                    request_nodes[i].id = nodeid
                    request_nodes[i].updateFromDict(node_data)
                node_request.updateFromDict(data)
            deleted = (data is None)  # data *are* none
            return watcher(node_request, deleted)

        self.client.DataWatch(path, callback)

    def deleteNodeRequest(self, node_request):
        '''
        Delete a request for nodes.

        A request that no longer exists is silently ignored.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        try:
            self.client.delete(path)
        except kze.NoNodeError:
            pass

    def nodeRequestExists(self, node_request):
        '''
        See if a NodeRequest exists in ZooKeeper.

        :param NodeRequest node_request: A NodeRequest to verify.

        :returns: True if the request exists, False otherwise.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        if self.client.exists(path):
            return True
        return False

    def storeNode(self, node):
        '''Store the node.

        The node is expected to already exist and is updated in its
        entirety.

        :param Node node: The node to update.
        '''
        path = '%s/%s' % (self.NODE_ROOT, node.id)
        self.client.set(path, self._dictToStr(node.toDict()))

    def lockNode(self, node, blocking=True, timeout=None):
        '''
        Lock a node.

        This should be called as soon as a request is fulfilled and
        the lock held for as long as the node is in-use.  It can be
        used by nodepool to detect if Zuul has gone offline and the
        node should be reclaimed.

        :param Node node: The node which should be locked.
        :param bool blocking: Passed to the lock's acquire(); when False
            the call does not wait for a lock held by someone else.
        :param timeout: Passed to the lock's acquire() as its timeout.

        :raises LockException: If acquiring timed out or the lock was
            not obtained.
        '''
        lock_path = '%s/%s/lock' % (self.NODE_ROOT, node.id)
        try:
            lock = Lock(self.client, lock_path)
            have_lock = lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException(
                "Timeout trying to acquire lock %s" % lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            raise LockException("Did not get lock on %s" % lock_path)

        node.lock = lock

    def unlockNode(self, node):
        '''
        Unlock a node.

        The node must already have been locked.

        :param Node node: The node which should be unlocked.

        :raises LockException: If the node holds no lock.
        '''
        if node.lock is None:
            raise LockException("Node %s does not hold a lock" % (node, ))
        node.lock.release()
        node.lock = None

    def heldNodeCount(self, autohold_key):
        '''
        Count the number of nodes being held for the given tenant/project/job.

        :param set autohold_key: A set with the tenant/project/job names.
            The names are joined with single spaces in iteration order and
            compared with each node's 'hold_job' value.

        :returns: The number of matching nodes in the HOLD state; 0 if the
            node root does not exist.
        '''
        identifier = " ".join(autohold_key)
        try:
            nodes = self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return 0

        count = 0
        for nodeid in nodes:
            node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
            try:
                node_data, node_stat = self.client.get(node_path)
            except kze.NoNodeError:
                # The node was removed between get_children() and get();
                # a vanished node cannot be held, so just skip it instead
                # of aborting the whole count.
                continue
            if not node_data:
                self.log.warning("Node ID %s has no data", nodeid)
                continue
            node_data = self._strToDict(node_data)
            if (node_data['state'] == zuul.model.STATE_HOLD and
                    node_data.get('hold_job') == identifier):
                count += 1
        return count
class ZK(object):
    """ZooKeeper-backed store for one beansdb cluster's admin state.

    Everything lives under ``/beansdb/<cluster>``: route versions, gc state
    per bucket, proxy and backup lists, server and disk info, migration
    status and pickled job records.

    NOTE(review): payloads are written and compared as plain ``str``
    (e.g. ``"busy"``, ``str(ver)``), which presumes a client that accepts and
    returns ``str`` data — confirm before running against a bytes-only
    client.
    """

    def __init__(self, cluster, zkservers=None):
        """Connect to ZooKeeper and make sure the cluster root exists.

        :param cluster: Cluster name; the root path is ``/beansdb/<cluster>``.
        :param zkservers: Host string for ``KazooClient``; when falsy the
            value returned by ``get_zkservers()`` is used.
        """
        if not zkservers:
            zkservers = get_zkservers()
        self.cluster = cluster
        self.zkpath = "/beansdb/%s" % cluster
        self.zk = KazooClient(hosts=zkservers)
        self.zk.start()
        self.zk.ensure_path(self.zkpath)

    # ------------------------------------------------------------------
    # path helpers
    # ------------------------------------------------------------------

    def _path_route(self):
        """Path whose data is the current route version.

        Its children hold the route contents in sequence, named
        ``route_<10-digit version>``.
        """
        return "%s/route" % self.zkpath

    def _path_gcs(self):
        """Root path of all gc state."""
        return "%s/gc" % self.zkpath

    def _path_gc(self, bucket):
        """Path of the gc state znode for ``bucket`` directly under the gc root."""
        return "%s/%s" % (self._path_gcs(), bucket)

    def _path_gc_host(self, host):
        """Path under the gc root for ``host``."""
        return "%s/%s" % (self._path_gcs(), host)

    def _path_gc_bucket(self, host, bucket_str):
        """Path of the gc state znode for ``bucket_str`` on ``host``."""
        return "%s/%s" % (self._path_gc_host(host), bucket_str)

    def _path_backup(self):
        return "%s/backup" % self.zkpath

    def _path_proxy(self):
        return "%s/proxy" % self.zkpath

    def _path_servers(self):
        return "%s/servers" % self.zkpath

    def _path_disks(self):
        return "%s/disks" % self.zkpath

    def _path_disk(self, host):
        return "%s/%s" % (self._path_disks(), host)

    def _path_jobs(self):
        """Path whose data is the rerouting job key; children are pickled jobs."""
        return "%s/jobs" % self.zkpath

    def _path_job(self, key):
        return "%s/%s" % (self._path_jobs(), key)

    def path_jobs(self):
        return self._path_jobs()

    def path_job(self, key):
        return self._path_job(key)

    def _path_migrate(self):
        return "%s/migrate" % self.zkpath

    def path_migrate_status(self, host):
        return "%s/%s" % (self._path_migrate(), host)

    def path_prepared_lock(self):
        return "%s/preparedlock" % self.zkpath

    def path_migrate_lock(self):
        return "%s/migratelock" % self.zkpath

    def path_prepared_jobs(self):
        return "%s/prepared_jobs" % self.zkpath

    def path_prepared_job(self, key):
        return "%s/%s" % (self.path_prepared_jobs(), key)

    def path_err_jobs(self):
        return "%s/error_jobs" % self.zkpath

    def path_err_job(self, key):
        return "%s/%s" % (self.path_err_jobs(), key)

    # ------------------------------------------------------------------
    # rerouting marker (data of the jobs znode)
    # ------------------------------------------------------------------

    def reroute_set(self, key):
        """Store ``key`` as the data of the jobs znode."""
        path = self._path_jobs()
        self.zk.set(path, key)

    def reroute_get(self):
        """Return the data of the jobs znode."""
        path = self._path_jobs()
        curr = self.zk.get(path)[0]
        return curr

    def reroute_clear(self):
        """Reset the rerouting marker to the literal string ``"None"``."""
        self.reroute_set("None")

    # ------------------------------------------------------------------
    # servers and disks
    # ------------------------------------------------------------------

    def all_server_set(self, content):
        """Store ``content`` as JSON in the servers znode (created if needed)."""
        path = self._path_servers()
        self.zk.ensure_path(path)
        self.zk.set(path, json.dumps(content))

    def all_server_get(self):
        """Return the stored server list, UTF-8 encoded; ``[]`` if unset."""
        if not self.zk.exists(self._path_servers()):
            return []
        raw = json.loads(self.zk.get(self._path_servers())[0])
        return [host.encode('utf-8') for host in raw]

    def disk_info_set(self, host, content=None):
        """Store ``host``'s disk info and return the full host -> info mapping.

        :param host: Key under which ``content`` is stored.
        :param content: JSON-serializable disk info; defaults to a fresh
            empty dict (a shared mutable default would leak into the
            returned mapping).
        """
        if content is None:
            content = {}
        data, _ = self._ensure_zk_path(self._path_disks())
        disk_info = json.loads(data)
        disk_info[host] = content
        self.zk.set(self._path_disks(), json.dumps(disk_info))
        return disk_info

    def disk_info_get(self, host):
        """Return the disk info stored for ``host``, or None."""
        data, _ = self._ensure_zk_path(self._path_disks())
        disk_info = json.loads(data)
        return disk_info.get(host)

    def _ensure_zk_path(self, path):
        """Return ``zk.get(path)``, first creating it as ``{}`` if missing."""
        if not self.zk.exists(path):
            self.zk.ensure_path(path)
            self.zk.set(path, json.dumps({}))
        return self.zk.get(path)

    # ------------------------------------------------------------------
    # migration status
    # ------------------------------------------------------------------

    def migrate_status_get(self, host):
        path = self.path_migrate_status(host)
        status, _ = self.zk.get(path)
        return status

    def migrate_status_set(self, host, status):
        path = self.path_migrate_status(host)
        self.zk.ensure_path(path)
        self.zk.set(path, status)

    # ------------------------------------------------------------------
    # routes
    # ------------------------------------------------------------------

    def route_set(self, content, commit=False):
        """Store a new route as a sequential child and return its version.

        :param content: Route payload stored in the new znode.
        :param commit: When True, also make the new version current.
        """
        path = self._path_route()
        res = self.zk.create(path + '/route_', content, sequence=True)
        # Sequential znodes end with a 10-digit counter: that is the version.
        ver = int(res[-10:])
        if commit:
            self.route_version_set(ver)
        return ver

    def route_get(self, ver=-1):
        """Return the route content for ``ver`` (current version if negative)."""
        path = self._path_route()
        if ver < 0:
            ver = int(self.zk.get(path)[0])
        return self.zk.get(path + "/route_%010d" % ver)[0]

    def route_version_set(self, ver):
        path = self._path_route()
        self.zk.set(path, str(ver))

    def route_version_get(self):
        path = self._path_route()
        return int(self.zk.get(path)[0])

    def route_verison_get_all(self):
        """Return all stored route versions, ascending."""
        path = self._path_route()
        vers = self.zk.get_children(path)
        return sorted([int(r[-10:]) for r in vers])

    def route_verison_get_newest(self):
        return max(self.route_verison_get_all())

    def route_watch(self, func):
        """Register ``func`` as a data watch on the route znode."""
        path = self._path_route()
        self.zk.DataWatch(path)(func)

    # ------------------------------------------------------------------
    # gc state
    # ------------------------------------------------------------------

    def gc_get(self):
        """Return ``{child: state}`` for every child of the gc root."""
        buckets = self.zk.get_children(self._path_gcs())
        return {b: self.zk.get(self._path_gc(b))[0] for b in buckets}

    def gc_set(self, buckets, state):
        """Set the gc state of ``buckets``.

        The gc cron sets "busy" and "idle"; the migrate cron sets "block"
        and "idle".

        :returns: When ``state`` is "block" and some buckets are currently
            "busy", nothing is written and the list of busy paths is
            returned; otherwise None.
        """
        assert state in ("busy", "idle", "block")
        paths = [self._path_gc(bucket) for bucket in buckets]
        for p in paths:
            self.zk.ensure_path(p)
        if state == "block":
            # zk.get() returns (data, stat); compare the data element.
            busy = [p for p in paths if self.zk.get(p)[0] == "busy"]
            if len(busy) > 0:
                return busy
        for p in paths:
            self.zk.set(p, state)

    def gc_set_bucket(self, host, bucket, state):
        """Set the gc state of one bucket on one host.

        :returns: The bucket path if ``state`` is "block" but the bucket is
            currently "busy" (nothing is written); otherwise None.
        """
        assert state in ("busy", "idle", "block")
        path = self._path_gc_bucket(host, bucket)
        self.zk.ensure_path(path)
        if state == "block":
            # zk.get() returns (data, stat); compare the data element.
            if self.zk.get(path)[0] == "busy":
                return path
        self.zk.set(path, state)

    def gc_get_bucket(self, host, bucket):
        path = self._path_gc_bucket(host, bucket)
        return self.zk.get(path)[0]

    def gc_get_status(self, host):
        """Return the children under ``host``'s gc path, or None if absent."""
        path = self._path_gc_host(host)
        if self.zk.exists(path):
            return self.zk.get_children(path)

    def gc_unblock_bucket(self, host, bucket):
        """Move one bucket from "block" back to "idle"."""
        stats = self.gc_get_bucket(host, bucket)
        if stats == 'block':
            self.gc_set_bucket(host, bucket, 'idle')

    def gc_unblock(self):
        """Set blocked buckets that no pending job refers to back to "idle".

        A job refers to the bucket named by the part of its key after the
        last underscore.
        """
        keys = self.job_list()
        buckets = {key.split("_")[-1] for key in keys}
        stats = self.gc_get()
        # Test the bucket name (not the state string) against the job set.
        to_unblock = [b for b, s in stats.items()
                      if s == 'block' and b not in buckets]
        if len(to_unblock):
            logger.info('unblock gc: %s', to_unblock)
            self.gc_set(to_unblock, 'idle')

    # ------------------------------------------------------------------
    # proxies and backup
    # ------------------------------------------------------------------

    def proxies_get(self):
        data, _ = self.zk.get(self._path_proxy())
        return json.loads(data)

    def proxies_set(self, addrs):
        path = self._path_proxy()
        self.zk.set(path, json.dumps(addrs))

    def backup_get(self):
        data, _ = self.zk.get(self._path_backup())
        return json.loads(data)

    def backup_set(self, dic):
        self.zk.set(self._path_backup(), json.dumps(dic))

    # ------------------------------------------------------------------
    # jobs (pickled)
    #
    # SECURITY NOTE: pickle.loads can execute arbitrary code when the znode
    # content is attacker-controlled. The *_get methods below are only safe
    # while every writer to this ZooKeeper tree is trusted.
    # ------------------------------------------------------------------

    def job_get(self, key):
        return pickle.loads(self.zk.get(self._path_job(key))[0])

    def job_delete(self, key):
        self.zk.delete(self._path_job(key))

    def job_set(self, key, job):
        path = self._path_job(key)
        self.zk.set(path, pickle.dumps(job))

    def job_create(self, key, job):
        path = self._path_job(key)
        self.zk.ensure_path(self.path_jobs())
        self.zk.create(path, pickle.dumps(job))

    def job_exist(self, key):
        return self.zk.exists(self._path_job(key))

    def job_list(self):
        return self.zk.get_children(self._path_jobs())

    def prepared_job_set(self, key, job):
        path = self.path_prepared_job(key)
        self.zk.ensure_path(self.path_prepared_jobs())
        self.zk.create(path, pickle.dumps(job))

    def prepared_job_get(self, key):
        return pickle.loads(self.zk.get(self.path_prepared_job(key))[0])

    def prepared_job_delete(self, key):
        self.zk.delete(self.path_prepared_job(key))

    def prepared_job_exist(self, key):
        return self.zk.exists(self.path_prepared_job(key))

    def err_job_set(self, key, job):
        path = self.path_err_job(key)
        self.zk.ensure_path(self.path_err_jobs())
        self.zk.create(path, pickle.dumps(job))

    def err_job_get(self, key):
        return pickle.loads(self.zk.get(self.path_err_job(key))[0])

    def err_job_delete(self, key):
        self.zk.delete(self.path_err_job(key))

    def err_job_exist(self, key):
        return self.zk.exists(self.path_err_job(key))
class ZooKeeper(object):
    """Tracks Solr cluster state published in ZooKeeper.

    Watches installed in ``__init__`` keep ``collections``, ``liveNodes`` and
    ``aliases`` up to date; the ``get*`` methods answer host lookups from
    those cached structures.
    """

    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = '/live_nodes'
    ALIASES = '/aliases.json'
    COLLECTION_STATUS = '/collections'
    CLUSTER_DETAILED_STATE = '/collections/%s/state.json'
    CLUSTER_STATE = '/clusterstate.json'
    SHARDS = 'shards'
    REPLICAS = 'replicas'
    STATE = 'state'
    ACTIVE = 'active'
    LEADER = 'leader'
    BASE_URL = 'base_url'
    TRUE = 'true'
    FALSE = 'false'
    COLLECTION = 'collection'

    def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None):
        """Connect (or adopt ``kazoo_client``) and install the state watches.

        :param zkServerAddress: Host string for a new ``KazooClient``.
        :param timeout: Timeout passed to a new ``KazooClient``.
        :param max_retries: ``max_tries`` for both the command and the
            connection retry policy of a new ``KazooClient``.
        :param kazoo_client: Existing client to use instead of creating one;
            it is started here as well.
        :raises RuntimeError: If ``KazooClient`` is None (kazoo missing).
        """
        if KazooClient is None:
            logging.error('ZooKeeper requires the `kazoo` library to be installed')
            raise RuntimeError('ZooKeeper requires the `kazoo` library to be installed')

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(zkServerAddress, read_only=True, timeout=timeout,
                                  command_retry={'max_tries': max_retries},
                                  connection_retry={'max_tries': max_retries})
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            # Only LOST and SUSPENDED are recorded; other states leave
            # self.state untouched.
            if state == KazooState.LOST:
                self.state = state
            elif state == KazooState.SUSPENDED:
                self.state = state

        self.zk.add_listener(connectionListener)

        def watchClusterDetailedState(data, *args, **kwargs):
            # Per-collection state.json: merged into the cached collections.
            if not data:
                LOG.warning("No cluster state available: no collections defined?")
            else:
                self.collections.update(json.loads(data.decode('utf-8')))
                LOG.info('Updated collections: %s', self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.COLLECTION_STATUS)
        def watchCollections(children):
            LOG.info("Updated collection: %s", children)
            # NOTE(review): a DataWatch is registered for every child each
            # time the child list changes, so watches for long-lived
            # collections may accumulate — confirm whether that matters.
            for child in children:
                self.zk.DataWatch(self.CLUSTER_DETAILED_STATE % child, watchClusterDetailedState)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            # Legacy single clusterstate.json: replaces the cached collections.
            if not data:
                LOG.warning("No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode('utf-8'))
                LOG.info('Updated collections: %s', self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            LOG.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode('utf-8'))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    LOG.warning('Expected to find %s in alias update %s',
                                ZooKeeper.COLLECTION, json_data.keys())
            else:
                self.aliases = None
            LOG.info("Updated aliases: %s", self.aliases)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """Return the base URLs of active replicas serving ``collname``.

        :param collname: Collection name or alias.
        :param only_leader: When True, only replicas flagged as leader.
        :param seen_aliases: Aliases already expanded (cycle detection).
        :returns: List of unique base URLs, in discovery order.
        :raises SolrError: If ``collname`` is neither a known alias nor a
            known collection.
        """
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        hosts = []
        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)
        collection = self.collections[collname]
        shards = collection[ZooKeeper.SHARDS]
        for shard in shards.values():
            if shard[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                continue
            replicas = shard[ZooKeeper.REPLICAS]
            for replica in replicas.values():
                if replica[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                    continue
                if not only_leader or (replica.get(ZooKeeper.LEADER, None) == ZooKeeper.TRUE):
                    base_url = replica[ZooKeeper.BASE_URL]
                    if base_url not in hosts:
                        hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        """Expand alias ``collname`` into the hosts of its collections.

        An alias value is a comma-separated list of names, each resolved
        through ``getHosts``. An alias that is met again while it is being
        expanded is logged and contributes no hosts.
        """
        if seen_aliases:
            if collname in seen_aliases:
                LOG.warning("%s in circular alias definition - ignored", collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)
        collections = self.aliases[collname].split(",")
        hosts = []
        for collection in collections:
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        """Return ``<base_url>/<collname>`` for a randomly chosen host.

        :raises SolrError: If no active host is available.
        """
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError('ZooKeeper returned no active shards!')
        return '%s/%s' % (random.choice(hosts), collname)

    def getLeaderURL(self, collname):
        """Return the URL of a randomly chosen leader replica."""
        return self.getRandomURL(collname, only_leader=True)
class ServiceRegister(object):
    """Service registration helper built on a ``KazooClient``.

    Most methods report failures through their return value (an error text
    and/or a negative code) instead of raising.
    """

    def __init__(self, hosts="127.0.0.1:2181", read_only=True, logger=None):
        """
        Service registration.

        :param hosts: ZooKeeper cluster address list
        :param read_only: whether the connection is read-only
        :param logger: logger object; when falsy, ``logging.basicConfig()``
            is called and the falsy value is still passed to the client
        """
        if not logger:
            import logging
            logging.basicConfig()
        self._zk = KazooClient(hosts, read_only=read_only, logger=logger)
        self._zk.start()

    @staticmethod
    def _error_text(exc):
        """Return the text describing ``exc``.

        Uses the exception's ``message`` attribute when it has one and
        falls back to ``str(exc)`` otherwise (Python 3 exceptions have no
        ``message`` attribute).
        """
        message = getattr(exc, "message", None)
        return message if message is not None else str(exc)

    def restart(self):
        """Restart the underlying client."""
        self._zk.restart()

    def retry_get(self, path, watcher=None):
        """
        Read a node through the client's retry helper.

        NOTE(review): ``get`` converts ``ZookeeperError`` into a return
        value, so errors of that family never reach the retry helper —
        confirm this is intended.

        :param path: node path
        :param watcher: watcher callback
        :return: on success: node value, version; on failure: error text,
            error code.
        """
        return self._zk.retry(self.get, path, watcher)

    def lock(self, path, identifier, timeout=None):
        """
        Distributed lock.

        :param path: path
        :param identifier: lock identifier
        :param timeout: timeout
        :return: lock object
        """
        return DLock(self._zk, path, identifier, timeout)

    def exist(self, path):
        """
        Whether a node exists.

        :param path: path
        :return: True if it exists, False otherwise.
        """
        state = self._zk.exists(path)
        return state is not None

    def create(self, path, value=""):
        """
        Create a node (missing parents are created too).

        :param path: node path
        :param value: node value
        :return: the node path (also when the node already exists); the
            error text on other ZooKeeper errors.
        """
        try:
            res_path = self._zk.create(path, value, makepath=True)
        except NodeExistsError:
            return path
        except NoNodeError as e:
            return self._error_text(e)
        except ZookeeperError as e:
            return self._error_text(e)
        else:
            return res_path

    def get(self, path, watcher=None):
        """
        Read a node's value and register a data watch on it.

        :param path: node path
        :param watcher: watcher callback
        :return: on success: node value, version; on failure: error text,
            error code (-2 missing node, -3 other ZooKeeper error).
        """
        try:
            data, state = self._zk.get(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError as e:
            return self._error_text(e), -2
        except ZookeeperError as e:
            return self._error_text(e), -3
        else:
            return data, state.version

    def get_children(self, path, watcher=None):
        """
        List a node's children.

        NOTE(review): the watcher is registered as a *data* watch on
        ``path``, not a children watch — confirm that is what callers
        expect.

        :param path: node path
        :param watcher: watcher callback
        :return: (children, 0) on success; ([], -2) for a missing node;
            ([], -3) for other ZooKeeper errors.
        """
        try:
            data = self._zk.get_children(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError:
            return [], -2
        except ZookeeperError:
            return [], -3
        else:
            return data, 0

    def set(self, path, value, version=-1):
        """
        Update a node's value.

        :param path: node path
        :param value: node value
        :param version: version passed to the client's ``set``
        :return: on success: the new version; on failure: error text.
        """
        try:
            state = self._zk.set(path, value, version)
        except BadVersionError as e:
            return self._error_text(e)
        except NoNodeError as e:
            return self._error_text(e)
        except ZookeeperError as e:
            return self._error_text(e)
        else:
            return state.version
class AnalyticsDiscovery(gevent.Greenlet):
    """Greenlet that publishes this instance in ZooKeeper and tracks peers.

    The instance's info is published as an ephemeral znode at
    ``<basepath>/<svc_name>/<inst>``. For every key in ``watchers`` the
    children of ``<basepath>/<key>`` are watched, their JSON data is cached
    in ``self._wchildren`` and the corresponding callback is invoked with
    the sorted list of child data.
    """

    def _sandesh_connection_info_update(self, status, message):
        """Report the ZooKeeper connection status and remember it.

        :param status: Name of a ``ConnectionStatus`` attribute
            (e.g. 'UP', 'DOWN', 'INIT').
        :param message: Free text attached to the status update.
        """
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name,
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        # Log only on transitions into DOWN / into UP.
        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Kazoo connection listener: track state; exit the process on LOST."""
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            self._sandesh_connection_info_update(status='UP', message='')
            self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
            self._reconnect = True
        elif state == KazooState.LOST:
            self._logger.error("Analytics Discovery connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Analytics Discovery connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT', message='Connection to zookeeper lost. Retrying')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        """Data-watch callback for one child of one watched key.

        Caches the child's JSON data (as a key-sorted ``OrderedDict``) or
        drops the child when there is no data, then marks the watcher as
        having a pending callback.
        """
        self._logger.error(
            "Analytics Discovery %s ChildData : child %s, data %s, event %s" %
            (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        """Children-watch callback: request a full resync in ``_run``."""
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst,
                 watchers=None, zpostfix="", freq=10):
        """Set up state; no ZooKeeper connection is made until ``_run``.

        :param logger: Logger to use; when None the ``logging`` module
            itself is used after ``logging.basicConfig()``.
        :param zkservers: Comma-separated ZooKeeper server list.
        :param svc_name: Service name this instance publishes under.
        :param inst: Instance identifier (name of the published znode).
        :param watchers: Mapping of watched key -> callback (or a falsy
            value for no callback). Defaults to a fresh empty dict.
        :param zpostfix: Suffix appended to ``/analytics-discovery-``.
        :param freq: Seconds to sleep between iterations of the main loop.
        """
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zkservers = zkservers
        self._zk = None
        self._pubinfo = None
        self._publock = Semaphore()
        # Avoid sharing one default dict between instances.
        self._watchers = {} if watchers is None else watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        """Publish ``pubinfo`` for this instance, or withdraw it when None.

        This function can be called concurrently by the main AlarmDiscovery
        processing loop as well as by clients. It is NOT re-entrant, so the
        whole body runs under ``self._publock``.

        Any failure while talking to ZooKeeper is logged, the connection is
        reported DOWN and a reconnect is requested; nothing is raised for it.
        """
        self._publock.acquire()
        try:
            self._pubinfo = pubinfo
            if self._conn_state == ConnectionStatus.UP:
                try:
                    svc_path = self._basepath + "/" + self._svc_name
                    inst_path = "%s/%s/%s" % \
                        (self._basepath, self._svc_name, self._inst)
                    self._logger.error("ensure %s" % svc_path)
                    self._logger.error("zk state %s (%s)" %
                                       (self._zk.state, self._zk.client_state))
                    self._zk.ensure_path(svc_path)
                    self._logger.error("check for %s" % inst_path)
                    if pubinfo is not None:
                        if self._zk.exists(inst_path):
                            self._zk.set(inst_path, self._pubinfo)
                        else:
                            self._zk.create(inst_path, self._pubinfo,
                                            ephemeral=True)
                    else:
                        if self._zk.exists(inst_path):
                            self._logger.error("withdrawing published info!")
                            self._zk.delete(inst_path)
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" %
                                       (messag, traceback.format_exc(),
                                        self._svc_name, str(self._pubinfo)))
                    self._sandesh_connection_info_update(status='DOWN', message='')
                    self._reconnect = True
            else:
                self._logger.error("Analytics Discovery cannot publish while down")
        finally:
            # Release even if the error handling above raises, so later
            # publish() calls cannot block forever.
            self._publock.release()

    def _run(self):
        """Greenlet body: connect, then publish and sync watched children.

        Phase 1 retries creating and starting a client until the listener
        reports the connection UP. Phase 2 installs the children watches and
        loops: deliver pending callbacks, and whenever a reconnect/resync
        was requested republish and re-read every watched child.
        """
        # Phase 1: connect, retrying once per second.
        while True:
            self._logger.error("Analytics Discovery zk start")
            self._zk = KazooClient(hosts=self._zkservers)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" %
                                       (messag, traceback.format_exc(),
                                        self._svc_name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        # Phase 2: steady state.
        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](
                                    sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect"
                                           % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(
                                self._basepath + "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(
                                        self._basepath + "/" + wk + "/" + elem,
                                        partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(
                                    self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                    OrderedDict(sorted(data_dict.items()))

                                self._logger.error(
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" %
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(
                                    self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" %
                                       self._svc_name)
                    self._zk.remove_listener(self._zk_listen)
                    gevent.sleep(1)
                    # Shutdown is deliberately best-effort: the bare excepts
                    # are kept so nothing raised here can prevent the
                    # remaining cleanup steps.
                    try:
                        self._zk.stop()
                    except:
                        self._logger.error("Stopping kazooclient failed")
                    else:
                        self._logger.error("Stopping kazooclient successful")
                    try:
                        self._zk.close()
                    except:
                        self._logger.error("Closing kazooclient failed")
                    else:
                        self._logger.error("Closing kazooclient successful")
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" %
                                       (messag, traceback.format_exc(),
                                        self._svc_name, str(self._pubinfo)))
                    # Force a full republish/resync on the next iteration.
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" %
                               (messag, traceback.format_exc(),
                                self._svc_name, str(self._pubinfo)))
            raise SystemExit
class ZooKeeper(object):
    '''
    Class implementing the ZooKeeper interface.

    This class uses the facade design pattern to keep common interaction
    with the ZooKeeper API simple and consistent for the caller, and
    limits coupling between objects. It allows for more complex interactions
    by providing direct access to the client connection when needed (though
    that is discouraged). It also provides for a convenient entry point for
    testing only ZooKeeper interactions.
    '''

    log = logging.getLogger("zuul.zk.ZooKeeper")

    REQUEST_ROOT = '/nodepool/requests'
    REQUEST_LOCK_ROOT = "/nodepool/requests-lock"
    NODE_ROOT = '/nodepool/nodes'
    HOLD_REQUEST_ROOT = '/zuul/hold-requests'

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self, enable_cache=True):
        '''
        Initialize the ZooKeeper object.

        :param bool enable_cache: When True, enables caching of ZooKeeper
            objects (e.g., HoldRequests).
        '''
        self.client = None
        self._became_lost = False
        self._last_retry_log = 0
        self.enable_cache = enable_cache

        # The caching model we use is designed around handing out model
        # data as objects. To do this, we use two caches: one is a TreeCache
        # which contains raw znode data (among other details), and one for
        # storing that data serialized as objects. This allows us to return
        # objects from the APIs, and avoids calling the methods to serialize
        # the data into objects more than once.
        self._hold_request_tree = None
        self._cached_hold_requests = {}

    def _dictToStr(self, data):
        '''Serialize a dict to UTF-8 encoded JSON bytes.'''
        return json.dumps(data).encode('utf8')

    def _strToDict(self, data):
        '''Deserialize UTF-8 encoded JSON bytes into a Python object.'''
        return json.loads(data.decode('utf8'))

    def _connection_listener(self, state):
        '''
        Listener method for Kazoo connection state changes.

        .. warning:: This method must not block.
        '''
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
            self._became_lost = True
        elif state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
        else:
            self.log.debug("ZooKeeper connection: CONNECTED")

    @property
    def connected(self):
        '''
        True when a client exists and its state is CONNECTED.

        Returns False (instead of raising) when no client has been created
        yet or after disconnect() has dropped it.
        '''
        if self.client is None:
            return False
        return self.client.state == KazooState.CONNECTED

    @property
    def suspended(self):
        '''
        True when the client state is SUSPENDED.

        Without a client there is no usable session, so this reports True
        rather than raising.
        '''
        if self.client is None:
            return True
        return self.client.state == KazooState.SUSPENDED

    @property
    def lost(self):
        '''
        True when the client state is LOST.

        Without a client there is no usable session, so this reports True
        rather than raising.
        '''
        if self.client is None:
            return True
        return self.client.state == KazooState.LOST

    @property
    def didLoseConnection(self):
        '''True if a LOST state was seen since the last resetLostFlag().'''
        return self._became_lost

    def resetLostFlag(self):
        '''Clear the flag reported by didLoseConnection.'''
        self._became_lost = False

    def logConnectionRetryEvent(self):
        '''Log a retry warning, at most once per retry_log_rate seconds.'''
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    def connect(self, hosts, read_only=False, timeout=10.0):
        '''
        Establish a connection with ZooKeeper cluster.

        Convenience method if a pre-existing ZooKeeper connection is not
        supplied to the ZooKeeper object at instantiation time.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        :param bool read_only: If True, establishes a read-only connection.
        :param float timeout: The ZooKeeper session timeout, in
            seconds (default: 10.0).
        '''
        if self.client is None:
            self.client = KazooClient(hosts=hosts, read_only=read_only,
                                      timeout=timeout)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            while True:
                try:
                    self.client.start(1)
                    break
                except KazooTimeoutError:
                    self.logConnectionRetryEvent()

        if self.enable_cache:
            self._hold_request_tree = TreeCache(self.client,
                                                self.HOLD_REQUEST_ROOT)
            self._hold_request_tree.listen_fault(self.cacheFaultListener)
            self._hold_request_tree.listen(self.holdRequestCacheListener)
            self._hold_request_tree.start()

    def cacheFaultListener(self, e):
        '''Log an exception reported by the hold request TreeCache.'''
        self.log.exception(e)

    def holdRequestCacheListener(self, event):
        '''
        Keep the hold request object cache in sync with the TreeCache.
        '''
        try:
            self._holdRequestCacheListener(event)
        except Exception:
            self.log.exception(
                "Exception in hold request cache update for event: %s", event)

    def _holdRequestCacheListener(self, event):
        '''Apply one TreeCache event to the cached HoldRequest objects.'''
        if hasattr(event.event_data, 'path'):
            # Ignore root node
            path = event.event_data.path
            if path == self.HOLD_REQUEST_ROOT:
                return

        if event.event_type not in (TreeEvent.NODE_ADDED,
                                    TreeEvent.NODE_UPDATED,
                                    TreeEvent.NODE_REMOVED):
            return

        path = event.event_data.path
        request_id = path.rsplit('/', 1)[1]

        if event.event_type in (TreeEvent.NODE_ADDED, TreeEvent.NODE_UPDATED):
            # Requests with no data are invalid
            if not event.event_data.data:
                return

            # Perform an in-place update of the already cached request
            d = self._bytesToDict(event.event_data.data)
            old_request = self._cached_hold_requests.get(request_id)
            if old_request:
                if event.event_data.stat.version <= old_request.stat.version:
                    # Don't update to older data
                    return
                old_request.updateFromDict(d)
                old_request.stat = event.event_data.stat
            else:
                request = zuul.model.HoldRequest.fromDict(d)
                request.id = request_id
                request.stat = event.event_data.stat
                self._cached_hold_requests[request_id] = request

        elif event.event_type == TreeEvent.NODE_REMOVED:
            # Removing an entry we never cached is not an error.
            self._cached_hold_requests.pop(request_id, None)

    def disconnect(self):
        '''
        Close the ZooKeeper cluster connection.

        You should call this method if you used connect() to establish a
        cluster connection.
        '''
        if self._hold_request_tree is not None:
            self._hold_request_tree.close()
            self._hold_request_tree = None

        if self.client is not None and self.client.connected:
            self.client.stop()
            self.client.close()
            self.client = None

    def resetHosts(self, hosts):
        '''
        Reset the ZooKeeper cluster connection host list.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        '''
        if self.client is not None:
            self.client.set_hosts(hosts=hosts)

    def submitNodeRequest(self, node_request, watcher):
        '''
        Submit a request for nodes to Nodepool.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.

        :param callable watcher: A callable object that will be
            invoked each time the request is updated.  It is called
            with two arguments: (node_request, deleted) where
            node_request is the same argument passed to this method,
            and deleted is a boolean which is True if the node no
            longer exists (notably, this will happen on disconnection
            from ZooKeeper).  The watcher should return False when
            further updates are no longer necessary.
        '''
        node_request.created_time = time.time()
        data = node_request.toDict()

        path = '{}/{:0>3}-'.format(self.REQUEST_ROOT, node_request.priority)
        path = self.client.create(path, self._dictToStr(data),
                                  makepath=True,
                                  sequence=True, ephemeral=True)
        reqid = path.split("/")[-1]
        node_request.id = reqid

        def callback(data, stat):
            if data:
                self.updateNodeRequest(node_request, data)
            deleted = (data is None)  # data *are* none
            return watcher(node_request, deleted)

        self.client.DataWatch(path, callback)

    def deleteNodeRequest(self, node_request):
        '''
        Delete a request for nodes.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        try:
            self.client.delete(path)
        except kze.NoNodeError:
            pass

    def nodeRequestExists(self, node_request):
        '''
        See if a NodeRequest exists in ZooKeeper.

        :param NodeRequest node_request: A NodeRequest to verify.

        :returns: True if the request exists, False otherwise.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        if self.client.exists(path):
            return True
        return False

    def storeNodeRequest(self, node_request):
        '''Store the node request.

        The request is expected to already exist and is updated in its
        entirety.

        :param NodeRequest node_request: The request to update.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        self.client.set(path, self._dictToStr(node_request.toDict()))

    def updateNodeRequest(self, node_request, data=None):
        '''Refresh an existing node request.

        :param NodeRequest node_request: The request to update.
        :param bytes data: The serialized (UTF-8 JSON) request data to
            use; query ZK if absent.
        '''
        if data is None:
            path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
            data, stat = self.client.get(path)
        data = self._strToDict(data)
        request_nodes = list(node_request.nodeset.getNodes())
        # Node IDs are matched to the nodeset's nodes by position.
        for i, nodeid in enumerate(data.get('nodes', [])):
            request_nodes[i].id = nodeid
            self.updateNode(request_nodes[i])
        node_request.updateFromDict(data)

    def storeNode(self, node):
        '''Store the node.

        The node is expected to already exist and is updated in its
        entirety.

        :param Node node: The node to update.
        '''
        path = '%s/%s' % (self.NODE_ROOT, node.id)
        self.client.set(path, self._dictToStr(node.toDict()))

    def updateNode(self, node):
        '''Refresh an existing node.

        :param Node node: The node to update.
        '''
        node_path = '%s/%s' % (self.NODE_ROOT, node.id)
        node_data, node_stat = self.client.get(node_path)
        node_data = self._strToDict(node_data)
        node.updateFromDict(node_data)

    def lockNode(self, node, blocking=True, timeout=None):
        '''
        Lock a node.

        This should be called as soon as a request is fulfilled and
        the lock held for as long as the node is in-use.  It can be
        used by nodepool to detect if Zuul has gone offline and the
        node should be reclaimed.

        :param Node node: The node which should be locked.
        :param bool blocking: Whether or not to block on trying to
            acquire the lock.
        :param int timeout: When blocking, how long to wait for the lock
            to get acquired. None, the default, waits forever.

        :raises: LockException if the lock could not be acquired.
        '''
        lock_path = '%s/%s/lock' % (self.NODE_ROOT, node.id)
        try:
            lock = Lock(self.client, lock_path)
            have_lock = lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException(
                "Timeout trying to acquire lock %s" % lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            raise LockException("Did not get lock on %s" % lock_path)

        node.lock = lock

    def unlockNode(self, node):
        '''
        Unlock a node.

        The node must already have been locked.

        :param Node node: The node which should be unlocked.

        :raises: LockException if the node does not hold a lock.
        '''
        if node.lock is None:
            raise LockException("Node %s does not hold a lock" % (node, ))
        node.lock.release()
        node.lock = None

    def lockNodeRequest(self, request, blocking=True, timeout=None):
        '''
        Lock a node request.

        This will set the `lock` attribute of the request object when the
        lock is successfully acquired.

        :param NodeRequest request: The request to lock.
        :param bool blocking: Whether or not to block on trying to
            acquire the lock
        :param int timeout: When blocking, how long to wait for the lock
            to get acquired. None, the default, waits forever.

        :raises: LockException if we failed to acquire the lock when
            blocking with a timeout, if we are not blocking and could not
            get the lock, or if the request node does not exist.
        '''
        path = "%s/%s" % (self.REQUEST_LOCK_ROOT, request.id)
        try:
            lock = Lock(self.client, path)
            have_lock = lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException(
                "Timeout trying to acquire lock %s" % path)
        except kze.NoNodeError:
            have_lock = False
            self.log.error("Request not found for locking: %s", request)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            raise LockException("Did not get lock on %s" % path)

        request.lock = lock
        self.updateNodeRequest(request)

    def unlockNodeRequest(self, request):
        '''
        Unlock a node request.

        The request must already have been locked.

        :param NodeRequest request: The request to unlock.

        :raises: LockException if the request is not currently locked.
        '''
        if request.lock is None:
            raise LockException(
                "Request %s does not hold a lock" % request)
        request.lock.release()
        request.lock = None

    def heldNodeCount(self, autohold_key):
        '''
        Count the number of nodes being held for the given tenant/project/job.

        :param set autohold_key: A set with the tenant/project/job names.
        '''
        identifier = " ".join(autohold_key)
        try:
            nodes = self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return 0

        count = 0
        for nodeid in nodes:
            node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
            try:
                node_data, node_stat = self.client.get(node_path)
            except kze.NoNodeError:
                # Node got removed on us. Just ignore.
                continue

            if not node_data:
                self.log.warning("Node ID %s has no data", nodeid)
                continue
            node_data = self._strToDict(node_data)
            if (node_data['state'] == zuul.model.STATE_HOLD and
                    node_data.get('hold_job') == identifier):
                count += 1
        return count

    # Copy of nodepool/zk.py begins here
    # (NODE_ROOT is already defined above with the same value.)
    LAUNCHER_ROOT = "/nodepool/launchers"

    def _bytesToDict(self, data):
        '''Deserialize UTF-8 encoded JSON bytes into a Python object.'''
        return json.loads(data.decode('utf8'))

    def _launcherPath(self, launcher):
        '''Return the znode path for the given launcher ID.'''
        return "%s/%s" % (self.LAUNCHER_ROOT, launcher)

    def _nodePath(self, node):
        '''Return the znode path for the given node ID.'''
        return "%s/%s" % (self.NODE_ROOT, node)

    def getRegisteredLaunchers(self):
        '''
        Get a list of all launchers that have registered with ZooKeeper.

        :returns: A list of Launcher objects, or empty list if none are found.
        '''
        try:
            launcher_ids = self.client.get_children(self.LAUNCHER_ROOT)
        except kze.NoNodeError:
            return []

        objs = []
        for launcher in launcher_ids:
            path = self._launcherPath(launcher)
            try:
                data, _ = self.client.get(path)
            except kze.NoNodeError:
                # launcher disappeared
                continue

            objs.append(Launcher.fromDict(self._bytesToDict(data)))
        return objs

    def getNodes(self):
        '''
        Get the current list of all nodes.

        :returns: A list of nodes.
        '''
        try:
            return self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return []

    def getNode(self, node):
        '''
        Get the data for a specific node.

        :param str node: The node ID.

        :returns: The node data, or None if the node was not found.
        '''
        path = self._nodePath(node)
        try:
            data, stat = self.client.get(path)
        except kze.NoNodeError:
            return None
        if not data:
            return None

        d = self._bytesToDict(data)
        d['id'] = node
        return d

    def nodeIterator(self):
        '''
        Utility generator method for iterating through all nodes.
        '''
        for node_id in self.getNodes():
            node = self.getNode(node_id)
            if node:
                yield node

    def getHoldRequests(self):
        '''
        Get the current list of all hold requests.
        '''
        try:
            return sorted(self.client.get_children(self.HOLD_REQUEST_ROOT))
        except kze.NoNodeError:
            return []

    def getHoldRequest(self, hold_request_id):
        '''
        Get a hold request by ID.

        :param str hold_request_id: The hold request ID.

        :returns: A HoldRequest with `id` and `stat` set, or None if the
            znode does not exist or holds no data.
        '''
        path = self.HOLD_REQUEST_ROOT + "/" + hold_request_id
        try:
            data, stat = self.client.get(path)
        except kze.NoNodeError:
            return None
        if not data:
            return None

        obj = zuul.model.HoldRequest.fromDict(self._strToDict(data))
        obj.id = hold_request_id
        obj.stat = stat
        return obj

    def storeHoldRequest(self, hold_request):
        '''
        Create or update a hold request.

        If this is a new request with no value for the `id` attribute of the
        passed in request, then `id` will be set with the unique request
        identifier after successful creation.

        :param HoldRequest hold_request: Object representing the hold request.
        '''
        if hold_request.id is None:
            path = self.client.create(
                self.HOLD_REQUEST_ROOT + "/",
                value=hold_request.serialize(),
                sequence=True,
                makepath=True)
            hold_request.id = path.split('/')[-1]
        else:
            path = self.HOLD_REQUEST_ROOT + "/" + hold_request.id
            self.client.set(path, hold_request.serialize())

    def _markHeldNodesAsUsed(self, hold_request):
        '''
        Changes the state for each held node for the hold request to 'used'.

        :returns: True if all nodes marked USED, False otherwise.
        '''
        def getHeldNodeIDs(request):
            node_ids = []
            for data in request.nodes:
                # TODO(Shrews): Remove type check at some point.
                # When autoholds were initially changed to be stored in ZK,
                # the node IDs were originally stored as a list of strings.
                # A later change embedded them within a dict. Handle both
                # cases here to deal with the upgrade.
                if isinstance(data, dict):
                    node_ids += data['nodes']
                else:
                    node_ids.append(data)
            return node_ids

        failure = False
        for node_id in getHeldNodeIDs(hold_request):
            node = self.getNode(node_id)
            if not node or node['state'] == zuul.model.STATE_USED:
                continue

            node['state'] = zuul.model.STATE_USED

            node_obj = zuul.model.Node(node.get('name'), node.get('label'))
            node_obj.updateFromDict(node)

            try:
                self.lockNode(node_obj, blocking=False)
                self.storeNode(node_obj)
            except Exception:
                self.log.exception(
                    "Cannot change HELD node state to USED "
                    "for node %s in request %s",
                    node_obj.id, hold_request.id)
                failure = True
            finally:
                # Only release the lock if lockNode() actually obtained it.
                try:
                    if node_obj.lock:
                        self.unlockNode(node_obj)
                except Exception:
                    self.log.exception(
                        "Failed to unlock HELD node %s for request %s",
                        node_obj.id, hold_request.id)

        return not failure

    def deleteHoldRequest(self, hold_request):
        '''
        Delete a hold request.

        The request is left in place if any of its held nodes could not be
        marked as USED.

        :param HoldRequest hold_request: Object representing the hold request.
        '''
        if not self._markHeldNodesAsUsed(hold_request):
            self.log.info(
                "Unable to delete hold request %s because "
                "not all nodes marked as USED.", hold_request.id)
            return

        path = self.HOLD_REQUEST_ROOT + "/" + hold_request.id
        try:
            self.client.delete(path, recursive=True)
        except kze.NoNodeError:
            pass

    def lockHoldRequest(self, request, blocking=True, timeout=None):
        '''
        Lock a hold request.

        This will set the `lock` attribute of the request object when the
        lock is successfully acquired.

        :param HoldRequest request: The hold request to lock.
        :param bool blocking: Whether or not to block on trying to
            acquire the lock.
        :param int timeout: When blocking, how long to wait for the lock
            to get acquired. None, the default, waits forever.

        :raises: LockException if the request has no ID or the lock could
            not be acquired.
        '''
        if not request.id:
            raise LockException(
                "Hold request without an ID cannot be locked: %s" % request)

        path = "%s/%s/lock" % (self.HOLD_REQUEST_ROOT, request.id)
        try:
            lock = Lock(self.client, path)
            have_lock = lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException(
                "Timeout trying to acquire lock %s" % path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            raise LockException("Did not get lock on %s" % path)

        request.lock = lock

    def unlockHoldRequest(self, request):
        '''
        Unlock a hold request.

        The request must already have been locked.

        :param HoldRequest request: The request to unlock.

        :raises: LockException if the request is not currently locked.
        '''
        if request.lock is None:
            raise LockException(
                "Request %s does not hold a lock" % request)
        request.lock.release()
        request.lock = None
class ZookeeperStorage:
    """A low level storage object.

    Manages and publishes the zookeeper connection.

    Manages the database "schema" and allows access to multiple "groups"
    database servers, each representing one logical cluster.

    Znodes live under ``<path prefix><folder>/<group>-<key>``; see
    ``_path()`` and ``_folder_path()``.
    """

    # Lazily created by the ``connection`` property.
    _zk = None

    def __init__(self, connection_string, path, timeout=10.0):
        """Remember connection settings; no connection is made here.

        ``path`` is the znode prefix for everything stored by this object;
        a trailing ``/`` is appended if missing.
        """
        self._connection_string = connection_string
        self._path_prefix = path
        self._timeout = timeout
        if not self._path_prefix.endswith('/'):
            self._path_prefix += '/'
        # Keeps references to the watch objects created by this storage.
        self._watchers = {}
        self._loop = asyncio.get_event_loop()

    @property
    def connection(self):
        """The KazooClient, created on first access (not yet started)."""
        if self._zk is None:
            self._zk = KazooClient(
                hosts=self._connection_string,
                timeout=self._timeout)
        return self._zk

    def dcs_connect(self):
        """Create the client if needed and start it."""
        self.connection.start()

    def dcs_disconnect(self):
        """Stop and drop the client; a no-op when there is no client."""
        if self._zk is None:
            return
        self._zk.stop()
        self._zk = None

    def _dict_watcher(self, group, what, callback):
        """Watch folder ``what`` and call ``callback`` with its clusters.

        When ``group`` is truthy only keys prefixed with ``<group>-`` are
        watched. The folder is created if it does not exist yet.
        """
        def hook(state, key, from_val, to_val):
            callback(_get_clusters(state))

        path = self._folder_path(what)
        # A falsy group (None or '') is passed through unchanged.
        prefix = group + '-' if group else group
        try:
            watch = DictWatch(self._zk, path, hook, prefix=prefix)
        except kazoo.exceptions.NoNodeError:
            self._zk.create(path, makepath=True)
            return self._dict_watcher(group, what, callback)
        self._watchers[id(watch)] = watch
        return watch

    def _listen_connection(self, state):
        # NOTE(review): _connection_state_changes and
        # _consume_connection_state_changes are not defined in the visible
        # part of this class -- confirm they are provided elsewhere.
        self._connection_state_changes.append(state)
        self._loop.call_soon_threadsafe(self._consume_connection_state_changes)

    def dcs_watch_conn_info(self, callback, group=None):
        """Watch the 'conn' folder, optionally limited to ``group``."""
        self._dict_watcher(group, 'conn', callback)

    def dcs_watch_state(self, callback, group=None):
        """Watch the 'state' folder, optionally limited to ``group``."""
        self._dict_watcher(group, 'state', callback)

    def _folder_path(self, folder):
        """Return the znode path of ``folder``."""
        return self._path_prefix + folder

    def _path(self, group, folder, key):
        """Return the znode path of ``key`` for ``group`` in ``folder``."""
        return self._path_prefix + folder + '/' + group + '-' + key

    def _get_static(self, group, key):
        """Return the raw bytes stored under 'static', or None if absent."""
        path = self._path(group, 'static', key)
        try:
            data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return data

    def _set_static(self, group, key, data, overwrite=False):
        """Create a 'static' value; return True if it was written.

        An existing value is only replaced when ``overwrite`` is true;
        otherwise False is returned and the stored value is kept.
        """
        path = self._path(group, 'static', key)
        try:
            self._zk.create(path, data, makepath=True)
        except kazoo.exceptions.NodeExistsError:
            if overwrite:
                self._zk.set(path, data)
                return True
            return False
        return True

    def dcs_get_timeline(self, group):
        """Return the stored timeline as an int (0 when none is stored)."""
        data = self._get_static(group, 'timeline')
        if data is None:
            data = b'0'
        return int(data.decode('ascii'))

    def dcs_set_timeline(self, group, timeline):
        """Store ``timeline``; raises ValueError if it would decrease."""
        assert isinstance(timeline, int)
        existing = self.dcs_get_timeline(group)
        if existing > timeline:
            raise ValueError('Timelines can only increase.')
        timeline = str(timeline).encode('ascii')
        self._set_static(group, 'timeline', timeline, overwrite=True)

    def dcs_set_database_identifier(self, group, database_id):
        """Store the identifier once; return False if one already exists."""
        database_id = database_id.encode('ascii')
        return self._set_static(group, 'database_identifier', database_id)

    def dcs_get_database_identifier(self, group):
        """Return the stored identifier as str, or None if absent."""
        data = self._get_static(group, 'database_identifier')
        if data is not None:
            data = data.decode('ascii')
        return data

    def dcs_get_lock_owner(self, group, name):
        """Return the owner string stored in the lock node, or None."""
        path = self._path(group, 'lock', name)
        try:
            existing_data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return existing_data.decode('utf-8')

    def dcs_unlock(self, group, name, owner):
        """Delete the lock node, but only if ``owner`` is its owner."""
        existing_owner = self.dcs_get_lock_owner(group, name)
        if existing_owner == owner:
            path = self._path(group, 'lock', name)
            self._zk.delete(path)

    def dcs_lock(self, group, name, owner):
        """Try to take the lock ``name`` for ``owner``.

        Returns one of:
        'locked'  -- the ephemeral lock node was created by this call;
        'owned'   -- the lock node already belongs to this session;
        'broken'  -- a node with our owner data from another session was
                     deleted and the lock re-acquired;
        'failed'  -- the lock is held by a different owner.
        """
        data = owner.encode('utf-8')
        path = self._path(group, 'lock', name)
        try:
            self._zk.create(path, data, ephemeral=True, makepath=True)
            return 'locked'
        except kazoo.exceptions.NodeExistsError:
            pass
        # lock exists, do we have it, can we break it?
        try:
            existing_data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            # lock broke while we were looking at it
            # try get it again
            return self.dcs_lock(group, name, owner)
        if stat.owner_session_id == self._zk.client_id[0]:
            # we already own the lock
            return 'owned'
        elif data == existing_data:
            # it is our lock, perhaps I am restarting. or there are 2 of me
            # running!
            try:
                # version= makes the delete fail if the node changed since
                # we read it.
                self._zk.delete(path, version=stat.version)
            except (kazoo.exceptions.NoNodeError,
                    kazoo.exceptions.BadVersionError):
                # lock broke while we were looking at it
                pass
            # try get the lock again
            result = self.dcs_lock(group, name, owner)
            if result == 'locked':
                return 'broken'
            return result
        return 'failed'

    def dcs_watch_lock(self, name, group, callback):
        """Call ``callback`` with the lock owner (str or None) on change.

        The handler is scheduled onto the current asyncio loop with
        ``call_soon_threadsafe``.
        """
        loop = asyncio.get_event_loop()

        def handler(data, stat, event):
            if data is not None:
                data = data.decode('utf-8')
            callback(data)

        path = self._path(group, 'lock', name)
        w = self._zk.DataWatch(path, partial(loop.call_soon_threadsafe, handler))
        self._watchers[id(w)] = w

    def dcs_get_database_identifiers(self):
        """Return ``{group: identifier}`` for every stored identifier.

        The group is taken as the part of the node name before the first
        ``-``.
        """
        wanted_info_name = 'database_identifier'
        dirpath = self._folder_path('static')
        try:
            children = self._zk.get_children(dirpath)
        except kazoo.exceptions.NoNodeError:
            return {}
        result = {}
        for name in children:
            owner, info_name = name.split('-', 1)
            if wanted_info_name != info_name:
                continue
            try:
                data, state = self._zk.get(dirpath + '/' + name)
            except kazoo.exceptions.NoNodeError:
                continue
            # NOTE(review): dcs_set_database_identifier() stores the raw
            # ASCII identifier, not JSON, yet this decodes it as JSON --
            # confirm which representation callers expect.
            state = json.loads(data.decode('ascii'))
            result[owner] = state
        return result

    def _watch_key_in_folder(self, folder, name, callback):
        """Watch ``folder`` and report the value of key ``name`` per cluster.

        ``callback`` receives a dict holding, for every cluster returned by
        ``_get_clusters``, its value for ``name`` (clusters without that
        key are left out).
        """
        def handler(state, key, from_val, to_val):
            # this is probably more complex than it needs to be!
            c_state = _get_clusters(state)
            new_state = {}
            for k, v in c_state.items():
                ours = v.get(name, None)
                if ours is not None:
                    new_state[k] = ours
            callback(new_state)

        dirpath = self._folder_path(folder)
        watch = DictWatch(
            self._zk,
            dirpath,
            handler,
            deserializer=lambda data: data.decode('utf-8'))
        self._watchers[id(watch)] = watch

    def dcs_watch_database_identifiers(self, callback):
        """Call ``callback`` with the per-cluster database identifiers."""
        self._watch_key_in_folder('static', 'database_identifier', callback)

    def dcs_watch_locks(self, name, callback):
        """Call ``callback`` with the per-cluster owner of lock ``name``."""
        self._watch_key_in_folder('lock', name, callback)

    def _set_info(self, group, type, owner, data):
        """Store JSON ``data`` in an ephemeral node owned by this session.

        Returns 'existing' when the node already belonged to this session,
        'create' when it did not exist, and 'takeover' when it belonged to
        another session and was deleted and re-created.
        """
        path = self._path(group, type, owner)
        data = json.dumps(data)
        data = data.encode('ascii')
        try:
            stat = self._zk.set(path, data)
            how = 'existing'
        except kazoo.exceptions.NoNodeError:
            how = 'create'
            stat = None
        if stat is not None and stat.owner_session_id != self._zk.client_id[0]:
            # The node belongs to another session: replace it so that the
            # ephemeral node is tied to ours.
            self._zk.delete(path)
            how = 'takeover'
            stat = None
        if stat is None:
            self._zk.create(path, data, ephemeral=True, makepath=True)
        return how

    def dcs_set_conn_info(self, group, owner, data):
        """Store connection info for ``owner``; see ``_set_info``."""
        return self._set_info(group, 'conn', owner, data)

    def dcs_set_state(self, group, owner, data):
        """Store state for ``owner``; see ``_set_info``."""
        return self._set_info(group, 'state', owner, data)

    def _get_all_info(self, group, type):
        """Yield ``(owner, decoded JSON)`` for each node in folder ``type``.

        When ``group`` is not None only that group's nodes are yielded.
        Yields nothing if the folder does not exist.
        """
        dirpath = self._folder_path(type)
        try:
            children = self._zk.get_children(dirpath)
        except kazoo.exceptions.NoNodeError:
            return
        for name in children:
            this_group, owner = name.split('-', 1)
            if group is not None and this_group != group:
                continue
            data, state = self._zk.get(dirpath + '/' + name)
            state = json.loads(data.decode('ascii'))
            yield owner, state

    def dcs_list_conn_info(self, group=None):
        """Return a list of ``(owner, info)`` from the 'conn' folder."""
        return list(self._get_all_info(group, 'conn'))

    def dcs_list_state(self, group=None):
        """Return a list of ``(owner, state)`` from the 'state' folder."""
        return list(self._get_all_info(group, 'state'))

    def dcs_delete_conn_info(self, group, owner):
        """Delete the connection info node; missing nodes are ignored."""
        path = self._path(group, 'conn', owner)
        try:
            self._zk.delete(path)
        except kazoo.exceptions.NoNodeError:
            pass
class AnalyticsDiscovery(gevent.Greenlet):
    """Greenlet that publishes this instance's info to ZooKeeper and watches
    the instances of other services under a common base path.

    Published info lives at ``<basepath>/<svc_name>/<inst>`` as an ephemeral
    node. For every key in ``watchers`` the children of ``<basepath>/<key>``
    are tracked and the associated callback (if any) is invoked with the
    sorted list of child data.
    """

    def _sandesh_connection_info_update(self, status, message):
        """Record the ZooKeeper connection status and log transitions.

        :param status: name of a ``ConnectionStatus`` attribute
            (e.g. 'INIT', 'UP', 'DOWN').
        :param message: free-form text passed along with the status.
        """
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name, status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Kazoo connection listener: track UP/DOWN and request a resync."""
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            if self._conn_state != ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='UP', message='')
                self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
                # Picked up by the loop in _run(), which republishes.
                self._reconnect = True
            else:
                self._logger.error("Analytics Discovery already connected")
        else:
            self._logger.error("Analytics Discovery NOT connected")
            if self._conn_state == ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='DOWN', message='')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        """DataWatch callback for one child of a watched service.

        Stores the decoded data (or drops the child when ``data`` is empty)
        and notifies the service's callback with the sorted child data.
        """
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._watchers[watcher](sorted(self._wchildren[watcher].values()))

    def _zk_watcher(self, watcher, children):
        """ChildrenWatch callback: any membership change requests a resync."""
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst,
                 watchers=None, zpostfix="", freq=10):
        """
        :param logger: logger to use; when None the ``logging`` module
            itself is used after ``logging.basicConfig()``.
        :param zkservers: comma-separated ZooKeeper server list.
        :param svc_name: name of the service this instance publishes under.
        :param inst: instance name (leaf node of the published path).
        :param watchers: dict mapping a service name to a callback (or a
            falsy value for "track but do not notify"). Defaults to no
            watchers.
        :param zpostfix: suffix appended to the base path.
        :param freq: seconds to sleep between iterations of the run loop.
        """
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zk = KazooClient(hosts=zkservers)
        self._pubinfo = None
        # A fresh dict per instance; a shared mutable default would leak
        # watchers between instances.
        self._watchers = {} if watchers is None else watchers
        self._wchildren = {}
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        """Publish ``pubinfo`` for this instance, or withdraw it when None.

        The info is remembered even when the connection is down so that it
        can be republished later. Any failure marks the connection DOWN and
        requests a resync.
        """
        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                inst_path = "%s/%s/%s" % \
                    (self._basepath, self._svc_name, self._inst)
                self._logger.error("ensure %s" % (self._basepath + "/" + self._svc_name))
                self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(self._basepath + "/" + self._svc_name)
                self._logger.error("check for %s/%s/%s" % \
                        (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists(inst_path):
                        self._zk.set(inst_path, self._pubinfo)
                    else:
                        self._zk.create(inst_path, self._pubinfo,
                                        ephemeral=True)
                else:
                    if self._zk.exists(inst_path):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete(inst_path)

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")

    def _resync_watched_service(self, wk):
        """Re-read all children of watched service ``wk`` and notify."""
        svc_path = self._basepath + "/" + wk
        self._zk.ensure_path(svc_path)
        children = self._zk.get_children(svc_path)

        old_children = set(self._wchildren[wk].keys())
        new_children = set(children)

        # Remove contents for the children who are gone
        # (DO NOT remove the watch)
        for elem in old_children - new_children:
            del self._wchildren[wk][elem]

        # Overwrite existing children, or create new ones
        for elem in new_children:
            # Create a watch for new children
            if elem not in self._wchildren[wk]:
                self._zk.DataWatch(svc_path + "/" + elem,
                                   partial(self._zk_datawatch, wk, elem))

            data_str, _ = self._zk.get(svc_path + "/" + elem)
            data_dict = json.loads(data_str)
            self._wchildren[wk][elem] = \
                OrderedDict(sorted(data_dict.items()))

            self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (wk, elem, self._wchildren[wk][elem], "GET"))
        if self._watchers[wk]:
            self._watchers[wk](sorted(self._wchildren[wk].values()))

    def _run(self):
        """Greenlet body: connect, then publish/resync whenever requested.

        Exits the loop on ``gevent.GreenletExit``; any unexpected error in
        the outer block is logged and turned into ``SystemExit``.
        """
        while True:
            try:
                self._zk.start()
                break
            except gevent.event.Timeout as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
            # Zookeeper is also throwing exception due to delay in master election
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper
            self._zk.add_listener(self._zk_listen)
            for wk in self._watchers:
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self.publish(self._pubinfo)

                        for wk in self._watchers:
                            self._resync_watched_service(wk)

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.stop()
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    # Retry the whole publish/resync on the next iteration.
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
class ZooKeeper(object):
    """Keep a local copy of cluster metadata stored in ZooKeeper.

    On construction the instance starts a kazoo client and registers
    watches on the cluster-state, live-nodes, aliases and per-collection
    state znodes.  The watch callbacks keep ``collections``, ``liveNodes``
    and ``aliases`` up to date; the ``get*`` methods answer host lookups
    purely from that cached data.
    """

    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = "/live_nodes"
    ALIASES = "/aliases.json"
    CLUSTER_STATE = "/clusterstate.json"
    COLLECTION_STATUS = "/collections"
    COLLECTION_STATE = "/collections/%s/state.json"
    SHARDS = "shards"
    REPLICAS = "replicas"
    STATE = "state"
    ACTIVE = "active"
    LEADER = "leader"
    BASE_URL = "base_url"
    TRUE = "true"
    FALSE = "false"
    COLLECTION = "collection"

    def __init__(self, zkServerAddress, timeout=15, max_retries=-1,
                 kazoo_client=None):
        """Connect to ZooKeeper and install all watches.

        Args:
            zkServerAddress: host string handed to ``KazooClient`` when no
                client is supplied.
            timeout: session timeout passed to ``KazooClient``.
            max_retries: ``max_tries`` for both command and connection
                retries of the created client.
            kazoo_client: optional ready-made client to use instead of
                creating one; it is started here either way.

        Raises:
            RuntimeError: if the module-level ``KazooClient`` is ``None``.
        """
        if KazooClient is None:
            message = "ZooKeeper requires the `kazoo` library to be installed"
            logging.error(message)
            raise RuntimeError(message)

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(
                zkServerAddress,
                read_only=True,
                timeout=timeout,
                command_retry={"max_tries": max_retries},
                connection_retry={"max_tries": max_retries},
            )
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            # Record every transition, CONNECTED included; otherwise the
            # attribute stays stuck on LOST/SUSPENDED after a reconnect.
            self.state = state

        self.zk.add_listener(connectionListener)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if not data:
                logger.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode("utf-8"))
                logger.info("Updated collections: %s", self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            logger.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode("utf-8"))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    logger.warning(
                        "Expected to find %s in alias update %s",
                        ZooKeeper.COLLECTION,
                        json_data.keys(),
                    )
            else:
                self.aliases = None
            logger.info("Updated aliases: %s", self.aliases)

        def watchCollectionState(data, *args, **kwargs):
            if not data:
                logger.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections.update(json.loads(data.decode("utf-8")))
                logger.info("Updated collections: %s", self.collections)

        # Collections that already have a DataWatch.  The children watch
        # fires with the complete child list on every change, so without
        # this guard each existing collection would get one more duplicate
        # watcher per change.
        watched_collections = set()

        @self.zk.ChildrenWatch(ZooKeeper.COLLECTION_STATUS)
        def watchCollectionStatus(children):
            logger.info("Updated collection: %s", children)
            for c in children:
                if c in watched_collections:
                    continue
                watched_collections.add(c)
                self.zk.DataWatch(self.COLLECTION_STATE % c,
                                  watchCollectionState)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """Return the distinct base URLs of active replicas of a collection.

        Args:
            collname: collection name, or an alias name.
            only_leader: when true, keep only replicas whose ``leader``
                field equals ``"true"``.
            seen_aliases: aliases already expanded; used to detect cycles.

        Returns:
            List of base URLs in first-seen order.

        Raises:
            SolrError: if ``collname`` is neither an alias nor a known
                collection.
        """
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)

        hosts = []
        shards = self.collections[collname][ZooKeeper.SHARDS]
        for shard in shards.values():
            if shard[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                continue
            for replica in shard[ZooKeeper.REPLICAS].values():
                if replica[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                    continue
                is_leader = replica.get(ZooKeeper.LEADER) == ZooKeeper.TRUE
                if only_leader and not is_leader:
                    continue
                base_url = replica[ZooKeeper.BASE_URL]
                if base_url not in hosts:
                    hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        """Expand an alias into the hosts of every collection it names.

        The alias value is a comma-separated list of names.  An alias that
        is reached again while it is being expanded is logged and
        contributes no hosts.
        """
        if seen_aliases:
            if collname in seen_aliases:
                logger.warning("%s in circular alias definition - ignored",
                               collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)

        hosts = []
        for collection in self.aliases[collname].split(","):
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        """Return ``<base_url>/<collname>`` for a randomly chosen host.

        Raises:
            SolrError: if no active host is available.
        """
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError("ZooKeeper returned no active shards!")
        return "%s/%s" % (random.choice(hosts), collname)  # NOQA: B311

    def getLeaderURL(self, collname):
        """Return a URL of a randomly chosen leader replica of ``collname``."""
        return self.getRandomURL(collname, only_leader=True)
class watcher:
    """Print score boards read from ZooKeeper whenever watched data changes.

    The constructor connects to ZooKeeper, installs data watches on the
    recent-scores and active-users znodes, and registers ``cleanup`` to run
    at interpreter exit.
    """

    def __init__(self, hostName, displayListSize):
        """Connect to ``hostName`` and start watching the game data.

        Args:
            hostName: host string handed to ``KazooClient``.
            displayListSize: maximum number of rows printed per board.
        """
        self.host = hostName
        self.maxDisplaySize = displayListSize
        self.zk = KazooClient(hosts=self.host)
        self.zk.start()
        self.zk.DataWatch("/gameData/recentScores", self.displayScores)
        self.zk.DataWatch("/gameData/activeUsers", self.displayScores)
        atexit.register(self.cleanup)

    def displayScores(self, var1, var2):
        """Data-watch callback: reprint both boards (arguments are unused)."""
        self.displayRecentScores()
        self.displayHighestScores()

    def _loadList(self, path):
        """Return the unpickled list stored at ``path``.

        The znode is created holding an empty pickled list if it is missing.
        """
        self.zk.ensure_path(path="/gameData")
        if not self.zk.exists(path=path):
            self.zk.create(path=path, value=pickle.dumps([]))
        payload, _ = self.zk.get(path=path)
        # NOTE(security): pickle.loads executes arbitrary code when the
        # payload is attacker-controlled; this is only safe if every writer
        # to these znodes is trusted.
        return pickle.loads(payload)

    def _printBoard(self, title, scores, rowFormat):
        """Print ``title``, a rule, and one row per ``(name, score)`` entry.

        Rows whose name is currently in the active-user list get `` **``
        appended.
        """
        print(title)
        print("------------------")
        activeUsers = self.activeUserList()
        for entry in scores:
            row = rowFormat % (entry[0], entry[1])
            if entry[0] in activeUsers:
                row += " **"
            print(row)

    def activeUserList(self):
        """Return the list stored under ``/gameData/activeUsers``."""
        return self._loadList("/gameData/activeUsers")

    def displayRecentScores(self):
        """Print the last ``maxDisplaySize`` entries of the recent scores."""
        scores = self._loadList("/gameData/recentScores")
        if len(scores) > self.maxDisplaySize:
            # Explicit start index rather than a negative slice, so that a
            # display size of 0 yields no rows instead of all of them.
            scores = scores[len(scores) - self.maxDisplaySize:]
        self._printBoard("\nMost recent scores", scores, "%s \t\t %d")

    def displayHighestScores(self):
        """Print the first ``maxDisplaySize`` entries of the max scores."""
        scores = self._loadList("/gameData/maxScores")
        if len(scores) > self.maxDisplaySize:
            scores = scores[0:self.maxDisplaySize]
        self._printBoard("\nHighest scores", scores, "%s \t %d")

    def cleanup(self):
        """Stop the ZooKeeper client."""
        self.zk.stop()
class Server(threading.Thread):
    '''
    Worker server (which is also a ZooKeeper client).

    Each instance creates an ephemeral sequential znode named
    ``<host>-<mode>-<sequence>`` under ``lock_base_path`` and uses the
    sibling ordering to take a shared (read, mode "R") or exclusive
    (write, mode "W") lock before doing its simulated work.
    '''

    # Lock serialising console output.  It is an in-process lock only; the
    # distributed lock is what this class implements through ZooKeeper.
    print_mutex = threading.Lock()
    DELAY_TIME = 3

    def __init__(self, zk_server_address, lock_base_path, host, serve_mode):
        """Create the thread and connect its ZooKeeper client.

        Args:
            zk_server_address: host string handed to ``KazooClient``.
            lock_base_path: parent znode under which lock nodes are created.
            host: identifier of this server, used in its lock node name.
            serve_mode: "W" for a write request; anything else is treated
                as a read request.
        """
        threading.Thread.__init__(self)
        self.lock_base_path = lock_base_path
        self.host = host
        self.serve_mode = serve_mode
        # Set once the node this server is waiting on has gone away.
        self.event = threading.Event()
        self.zkclient = KazooClient(zk_server_address)
        self.zkclient.add_listener(self.zk_connect_listener)
        self.zkclient.start()

    def zk_connect_listener(self, state):
        """Connection-state listener: report the state on the console.

        Raises:
            Exception: for any state other than CONNECTED or LOST.
        """
        # ``with`` guarantees the print lock is released even when the
        # final branch raises; acquire()/release() around a raise would
        # leave it held forever and block every other server's output.
        with Server.print_mutex:
            # Listeners receive KazooState values.
            if state == KazooState.CONNECTED:
                print(self.host + " 已经开启...")
            elif state == KazooState.LOST:
                print(self.host + " 停止服务...")
            else:
                raise Exception(self.host + " 未正常开启...")

    def run(self):
        """Thread body: create the lock node, lock, work, unlock, stop."""
        try:
            # Lock nodes look like /shared_lock/192.168.0.0-R-0000000001
            self.create_lock_node()
            try:
                self.acquire_lock()
                self.work()
            finally:
                # Always give the lock back, even if the work failed.
                self.release_lock()
        finally:
            self.stop()

    def create_lock_node(self):
        """Create this server's ephemeral sequential lock node."""
        # ensure_path tolerates the parent already existing, so concurrent
        # servers cannot fail on a check-then-create race.
        self.zkclient.ensure_path(self.lock_base_path)
        node_prefix = "%s/%s-%s-" % (self.lock_base_path, self.host,
                                     self.serve_mode)
        # The value must be a byte string for kazoo on Python 3.
        self.node_path = self.zkclient.create(
            node_prefix, b"", acl=self.zkclient.default_acl,
            ephemeral=True, sequence=True)

    def pre_node_delete_watch(self, data, stat, event):
        """Data-watch callback on the node this server is waiting for.

        Sets ``self.event`` when the node is deleted, and also when the
        watch reports no stat at all (the node was already gone before the
        watch was registered); without the latter the waiter would block
        forever.
        """
        if stat is None or (event is not None
                            and event.type == EventType.DELETED):
            self.event.set()

    @staticmethod
    def _parse_node_name(node_name):
        """Split ``<host>-<mode>-<sequence>`` into ``(mode, sequence)``.

        Splitting from the right keeps hosts that contain "-" intact.
        """
        _host, mode, sequence = node_name.rsplit("-", 2)
        return mode, int(sequence)

    def acquire_lock(self):
        """Block until this server holds the lock.

        A writer waits for the node immediately before its own; a reader
        waits for the closest preceding write node.  If there is no such
        node the lock is held immediately.
        """
        node_name = self.node_path.split("/")[-1]
        sorted_children = self.get_sorted_children()
        node_index = sorted_children.index(node_name)

        if self.serve_mode == "W":
            blocker_index = node_index - 1
        else:
            # Closest earlier write request, or -1 if all earlier nodes
            # are reads.
            blocker_index = -1
            for i in reversed(range(node_index)):
                if self._parse_node_name(sorted_children[i])[0] == "W":
                    blocker_index = i
                    break

        if blocker_index < 0:
            return

        pre_node_path = (self.lock_base_path + "/"
                         + sorted_children[blocker_index])
        self.zkclient.DataWatch(pre_node_path, self.pre_node_delete_watch)
        self.event.wait()

    def work(self):
        """Announce the work on the console, then sleep ``DELAY_TIME``."""
        with Server.print_mutex:
            if self.serve_mode == "W":
                print(self.host + " 正在写数据...")
            else:
                print(self.host + " 正在读数据...")
        # Simulate time spent working.
        sleep(self.DELAY_TIME)

    def release_lock(self):
        """Release the lock by deleting this server's node."""
        self.zkclient.delete(self.node_path)

    def get_sorted_children(self):
        """Return the children of the lock root ordered by sequence number."""
        children = self.zkclient.get_children(self.lock_base_path)
        children.sort(key=lambda name: self._parse_node_name(name)[1])
        return children

    def stop(self):
        """Stop and close the ZooKeeper client."""
        # NOTE(review): pre_node_delete_watch is registered as a data
        # watch, never as a state listener, so this call looks like it has
        # nothing to remove; left in place to keep the shutdown sequence
        # (and the LOST message printed by the real listener) unchanged.
        self.zkclient.remove_listener(self.pre_node_delete_watch)
        self.zkclient.stop()
        self.zkclient.close()
# Command-line entry script: parse the application to launch, connect to
# ZooKeeper, watch znode /z, and serve a small interactive prompt.

# argument parser setup
parser = argparse.ArgumentParser(description='Script description TODO')
parser.add_argument('application', metavar='"app"',
                    help='application to run after creating znode /z')
application_to_run = parser.parse_args().application
print("Aplication to run when /z node exists is \"" + application_to_run + "\"")

# Module-level state; kept at top level so other functions in this file
# can reach it.
z_already_exists = False
application_process = None

servers_addresses = ['127.0.0.1:2181', '127.0.0.1:2182', '127.0.0.1:2183']
hosts = ','.join(servers_addresses)

# start kazoo client, then add state listener and /z node watcher
zk = KazooClient(hosts=hosts)
zk.start()
try:
    zk.add_listener(state_listener)
    zk.DataWatch("/z", watch_node)

    # handle user commands
    while True:
        try:
            command = input('Type the command ( tree | quit )\n')
        except (EOFError, KeyboardInterrupt):
            # A closed stdin or Ctrl-C is treated as a request to quit so
            # the same cleanup runs as for the explicit command.
            command = 'quit'

        if command == 'tree':
            visualize_z_tree()
        elif command == 'quit':
            if z_already_exists:
                kill_app()
            break
        else:
            print('Incorrect command')
finally:
    # Always stop the client, even if a command handler raised.
    zk.stop()
class ZkClient:
    """ZooKeeper-backed service discovery and configuration access.

    The constructor starts a kazoo client and registers the three lookup
    methods as data-watch callbacks on their respective znodes.
    """

    def __init__(self, zk_servers, app):
        """Connect to ZooKeeper and install the data watches.

        Args:
            zk_servers: host string handed to ``KazooClient``.
            app: stored on the instance as ``self.app``.
        """
        self.app = app
        self.zk = KazooClient(hosts=zk_servers)
        self.zk.start()
        self.server_node_path = "/entry/service"
        self.node = "/entry/serviceinfo/node/loan_mng"
        self.os_center_node = "/entry/service/os_center/node"
        self.loan_mng_hosts = ""
        self.zk.DataWatch(self.node, self.get_loan_mng_hosts)
        self.zk.DataWatch(self.os_center_node, self.get_os_center_hosts)
        self.zk.DataWatch(self.server_node_path, self.get_servers_node)

    def get_loan_mng_hosts(self, *args):
        """Refresh ``self.loan_mng_hosts`` from the first listed node.

        Reads the JSON stored at ``self.node`` and builds
        ``http://<ip>:<port>`` from the first entry of ``node_list``.

        Returns:
            The host URL on success.  On any failure the attribute is reset
            to ``""`` and ``None`` is returned.
        """
        try:
            data = json.loads(self.zk.get(self.node)[0])
            first_node = data["node_list"][0]
            host = "http://{}:{}".format(first_node["ip"], first_node["port"])
            logger.info("ZK | GET LOAN_MNG HOSTS | SUCCESS | HOST: {}".format(host))
            self.loan_mng_hosts = host
            return host
        except Exception as e:
            logger.info("ZK | GET LOAN_MNG HOSTS | FAILED | ERROR: {}".format(str(e)))
            self.loan_mng_hosts = ""

    def get_os_center_hosts(self, *args):
        """Return ``http://<data>`` for the first child of the os_center node.

        Returns:
            The host URL, or ``""`` on any failure.
        """
        try:
            children = self.zk.get_children(self.os_center_node)
            node = children[0]
            data = self.zk.get(self.os_center_node + "/" + node)[0].decode()
            host = "http://{}".format(data)
            logger.info("ZK | GET OS_CENTER HOSTS | SUCCESS | HOST: {}".format(host))
            return host
        except Exception as e:
            logger.info("ZK | GET OS_CENTER HOSTS | FAILED | ERROR: {}".format(str(e)))
            return ""

    def get_config(self, category):
        """Return the JSON config stored for ``category``.

        A missing znode is created (parents included) holding ``{}``.

        Returns:
            The decoded config, or ``{}`` on any ZooKeeper/JSON failure.

        Raises:
            KeyError: if ``category`` has no entry in
                ``ConfigNameMap.zk_path``.
        """
        path = ConfigNameMap.zk_path[category]
        try:
            # Creation lives inside the try and uses makepath so a missing
            # parent is handled like every other failure here instead of
            # escaping as an exception.
            if not self.zk.exists(path):
                self.zk.create(path, json.dumps({}).encode(), makepath=True)
            return json.loads(self.zk.get(path)[0].decode())
        except Exception as e:
            logger.info("ZK | GET CONFIG | FAILED | CATEGORY: {}| ERROR: {}".format(category, str(e)))
            return {}

    def write_config(self, category, config):
        """Store ``config`` as JSON at the znode for ``category``.

        Returns:
            ``True`` on success, ``False`` on any ZooKeeper/JSON failure.

        Raises:
            KeyError: if ``category`` has no entry in
                ``ConfigNameMap.zk_path``.
        """
        path = ConfigNameMap.zk_path[category]
        try:
            # ensure_path creates the node itself when it is missing, so no
            # separate exists/create step is needed before set.
            self.zk.ensure_path(path)
            self.zk.set(path, json.dumps(config).encode())
            return True
        except Exception as e:
            logger.info("ZK | SYNC CONFIG | FAILED | CATEGORY: {}| ERROR: {}".format(category, str(e)))
            return False

    def get_servers_node(self, *args):
        """Return the registered nodes of every service.

        For each child ``<name>`` of ``self.server_node_path`` the data of
        every znode under ``<name>/node`` is collected.

        Returns:
            A list of ``{"name": <service>, "node": [<data>, ...]}`` dicts
            with duplicates removed and first-seen order kept; unreadable
            registrations appear as ``None``.  ``[]`` if the service root
            does not exist.
        """
        def _get_children(path):
            # Best effort: a service without a readable "node" child simply
            # has no registrations.
            try:
                return self.zk.get_children(path)
            except Exception:
                return []

        def _get_data(path):
            # Best effort: an unreadable registration is reported as None.
            try:
                return self.zk.get(path)[0].decode()
            except Exception:
                return None

        servers_node = []
        try:
            for server_name in self.zk.get_children(self.server_node_path):
                path = "{}/{}/node".format(self.server_node_path, server_name)
                data = [_get_data(path + "/" + child)
                        for child in _get_children(path)]
                servers_node.append({
                    "name": server_name,
                    # dict.fromkeys de-duplicates with a stable order,
                    # unlike set().
                    "node": list(dict.fromkeys(data)),
                })
            return servers_node
        except NoNodeError:
            logger.warning("NO NODE ERROR | NODE PATH {}".format(self.server_node_path))
            return []