Exemple #1
0
def watch_zookeeper_nodes(zookeeper: KazooClient, nodes: Any) -> NoReturn:
    """Attach a data watch to every node, then heartbeat the outputs forever.

    For each entry in ``nodes`` a ``NodeWatcher`` is built from the entry's
    ``dest``, ``owner``, ``group`` and ``mode`` and its ``on_change`` method is
    registered as a Kazoo ``DataWatch`` on the entry's ``source`` path.

    After that this function never returns: every ``HEARTBEAT_INTERVAL``
    seconds, and only while ``zookeeper.connected`` is true, it touches each
    destination file with ``os.utime``. A failure to touch a file is logged
    as a warning and does not stop the loop.
    """
    for entry in nodes:
        node_watcher = NodeWatcher(entry.dest, entry.owner, entry.group, entry.mode)
        zookeeper.DataWatch(entry.source, node_watcher.on_change)

    # From here on the real work happens on Kazoo's worker thread; this
    # thread only sleeps and bumps mtimes to prove the process is alive.
    while True:
        time.sleep(HEARTBEAT_INTERVAL)

        # Reconnect handling with the background thread is explained in
        # baseplate.live_data.zookeeper; while disconnected we skip the
        # heartbeat entirely.
        if not zookeeper.connected:
            continue

        for entry in nodes:
            try:
                logger.debug("Heartbeating %s", entry.dest)

                # Touching the real output file makes FileWatchers re-parse it
                # on their next read. That cost is accepted on purpose: the
                # heartbeat should fail if the actual file is broken (e.g.
                # bogus permissions), which a separate proxy file would hide.
                os.utime(entry.dest, None)
            except OSError as exc:
                logger.warning("%s: could not heartbeat: %s", entry.dest, exc)
Exemple #2
0
def get_zk():
    """Return the module-wide KazooClient, creating and starting it on first use.

    On the first call the client is built from the application's
    ``ZK_CONNECTION_STRING`` setting with a default ACL, started, given digest
    credentials, and a data watch on ``/services`` is registered with
    ``ensure_root`` as its callback. Later calls return the cached client.
    """
    global _zk
    if _zk is not None:
        return _zk

    connection_string = app.config['ZK_CONNECTION_STRING']
    default_acl = (
        # anyone may read
        make_acl('world', 'anyone', read=True),
        # the creator of a node gets every permission on it
        make_acl('auth', '', all=True),
    )

    # The global is assigned before start() so the order of side effects
    # matches the original implementation exactly.
    _zk = KazooClient(connection_string, default_acl=default_acl)
    _zk.start()
    _zk.add_auth('digest', jones_credential)
    _zk.DataWatch('/services', func=ensure_root)
    return _zk
Exemple #3
0
class ZKClient:
    """Register a service instance in ZooKeeper and track its JSON config.

    Construction connects to ZooKeeper, creates an ephemeral sequential
    ``server`` node under ``/entry/service/<service_name>/node`` holding the
    encoded ``host``, and installs a data watch on
    ``/entry/config/service/<service_name>`` that keeps ``self.config`` loaded.
    """

    def __init__(self, zk_servers, service_name, host):
        self.zk = KazooClient(zk_servers)
        self.zk.start()

        self.service_name = service_name

        # Advertise this instance under the service's node directory.
        self.serve_path = "/entry/service/{name}/node".format(name=service_name)
        self.zk.ensure_path(self.serve_path)
        self.zk.create(
            self.serve_path + "/server",
            host.encode(),
            ephemeral=True,
            sequence=True,
        )

        # Follow the service configuration node.
        self.config_path = "/entry/config/service/{name}".format(
            name=self.service_name
        )
        self.zk.DataWatch(self.config_path, self.read_config)

    def read_config(self, *args):
        """Reload ``self.config`` from ZooKeeper.

        Used as the data-watch callback; the arguments supplied by the watch
        are ignored and the node is read again. A missing config node is
        created holding an empty JSON object.
        """
        client = self.zk
        node = self.config_path

        client.ensure_path("/entry/config/service")
        node_missing = not client.exists(node)
        if node_missing:
            empty_config = json.dumps({}).encode()
            client.create(node, empty_config)

        payload = client.get(node)[0]
        self.config = json.loads(payload.decode())

    def update_config(self, config):
        """Overwrite the config node with ``config`` serialised as JSON."""
        serialised = json.dumps(config).encode()
        self.zk.set(self.config_path, serialised)
Exemple #4
0
class ZooKeeper(object):
    '''
    Class implementing the ZooKeeper interface.

    This class uses the facade design pattern to keep common interaction
    with the ZooKeeper API simple and consistent for the caller, and
    limits coupling between objects. It allows for more complex interactions
    by providing direct access to the client connection when needed (though
    that is discouraged). It also provides for a convenient entry point for
    testing only ZooKeeper interactions.
    '''

    log = logging.getLogger("zuul.zk.ZooKeeper")

    REQUEST_ROOT = '/nodepool/requests'
    NODE_ROOT = '/nodepool/nodes'

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self):
        '''
        Initialize the ZooKeeper object.

        No connection is made here; call connect() to create the client.
        '''
        self.client = None
        # Set by the connection listener once the session has been LOST;
        # stays set until resetLostFlag() is called.
        self._became_lost = False
        # time.monotonic() value of the last "retrying" log message.
        self._last_retry_log = 0

    def _dictToStr(self, data):
        '''Serialize a dict to UTF-8 encoded JSON bytes.'''
        return json.dumps(data).encode('utf8')

    def _strToDict(self, data):
        '''Deserialize UTF-8 encoded JSON bytes.'''
        return json.loads(data.decode('utf8'))

    def _connection_listener(self, state):
        '''
        Listener method for Kazoo connection state changes.

        .. warning:: This method must not block.
        '''
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
            self._became_lost = True
        elif state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
        else:
            self.log.debug("ZooKeeper connection: CONNECTED")

    @property
    def connected(self):
        '''True if the client state is CONNECTED.'''
        return self.client.state == KazooState.CONNECTED

    @property
    def suspended(self):
        '''True if the client state is SUSPENDED.'''
        return self.client.state == KazooState.SUSPENDED

    @property
    def lost(self):
        '''True if the client state is LOST.'''
        return self.client.state == KazooState.LOST

    @property
    def didLoseConnection(self):
        '''True if a LOST state was seen since the last resetLostFlag().'''
        return self._became_lost

    def resetLostFlag(self):
        '''Clear the flag reported by didLoseConnection.'''
        self._became_lost = False

    def logConnectionRetryEvent(self):
        '''
        Log a connection retry warning, at most once per retry_log_rate
        seconds.
        '''
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    def connect(self, hosts, read_only=False, timeout=10.0):
        '''
        Establish a connection with ZooKeeper cluster.

        Convenience method if a pre-existing ZooKeeper connection is not
        supplied to the ZooKeeper object at instantiation time. Does
        nothing if a client already exists. Blocks until the initial
        connection succeeds.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        :param bool read_only: If True, establishes a read-only connection.
        :param float timeout: The ZooKeeper session timeout, in
            seconds (default: 10.0).
        '''
        if self.client is None:
            self.client = KazooClient(hosts=hosts,
                                      read_only=read_only,
                                      timeout=timeout)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            while True:
                try:
                    self.client.start(1)
                    break
                except KazooTimeoutError:
                    self.logConnectionRetryEvent()

    def disconnect(self):
        '''
        Close the ZooKeeper cluster connection.

        You should call this method if you used connect() to establish a
        cluster connection.
        '''
        if self.client is None:
            return
        # Stop the client even if the session is currently suspended or
        # lost. Skipping this would leave the client running and
        # self.client set, which makes a later connect() a no-op.
        self.client.stop()
        self.client.close()
        self.client = None

    def resetHosts(self, hosts):
        '''
        Reset the ZooKeeper cluster connection host list.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        '''
        if self.client is not None:
            self.client.set_hosts(hosts=hosts)

    def submitNodeRequest(self, node_request, watcher):
        '''
        Submit a request for nodes to Nodepool.

        The request is stored as an ephemeral, sequential znode and its
        final path component is assigned to ``node_request.id``.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.

        :param callable watcher: A callable object that will be
            invoked each time the request is updated.  It is called
            with two arguments: (node_request, deleted) where
            node_request is the same argument passed to this method,
            and deleted is a boolean which is True if the node no
            longer exists (notably, this will happen on disconnection
            from ZooKeeper).  The watcher should return False when
            further updates are no longer necessary.
        '''
        data = node_request.toDict()
        data['created_time'] = time.time()

        path = '%s/%s-' % (self.REQUEST_ROOT, node_request.priority)
        path = self.client.create(path,
                                  self._dictToStr(data),
                                  makepath=True,
                                  sequence=True,
                                  ephemeral=True)
        reqid = path.split("/")[-1]
        node_request.id = reqid

        def callback(data, stat):
            # Must be computed before ``data`` is rebound to the parsed
            # dict below.
            deleted = (data is None)
            if data:
                data = self._strToDict(data)
                request_nodes = list(node_request.nodeset.getNodes())
                # Node ids are matched to requested nodes by position.
                for i, nodeid in enumerate(data.get('nodes', [])):
                    node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
                    node_data, node_stat = self.client.get(node_path)
                    node_data = self._strToDict(node_data)
                    request_nodes[i].id = nodeid
                    request_nodes[i].updateFromDict(node_data)
                node_request.updateFromDict(data)
            return watcher(node_request, deleted)

        self.client.DataWatch(path, callback)

    def deleteNodeRequest(self, node_request):
        '''
        Delete a request for nodes.

        A request that no longer exists is silently ignored.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.
        '''

        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        try:
            self.client.delete(path)
        except kze.NoNodeError:
            pass

    def nodeRequestExists(self, node_request):
        '''
        See if a NodeRequest exists in ZooKeeper.

        :param NodeRequest node_request: A NodeRequest to verify.

        :returns: True if the request exists, False otherwise.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        return bool(self.client.exists(path))

    def storeNode(self, node):
        '''Store the node.

        The node is expected to already exist and is updated in its
        entirety.

        :param Node node: The node to update.
        '''

        path = '%s/%s' % (self.NODE_ROOT, node.id)
        self.client.set(path, self._dictToStr(node.toDict()))

    def lockNode(self, node, blocking=True, timeout=None):
        '''
        Lock a node.

        This should be called as soon as a request is fulfilled and
        the lock held for as long as the node is in-use.  It can be
        used by nodepool to detect if Zuul has gone offline and the
        node should be reclaimed.

        On success the lock object is stored in ``node.lock``.

        :param Node node: The node which should be locked.
        :param bool blocking: Passed to the lock's acquire(); whether to
            wait for the lock.
        :param timeout: Passed to the lock's acquire().

        :raises LockException: If acquiring timed out or the lock was
            not obtained.
        '''

        lock_path = '%s/%s/lock' % (self.NODE_ROOT, node.id)
        try:
            lock = Lock(self.client, lock_path)
            have_lock = lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException("Timeout trying to acquire lock %s" %
                                lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            raise LockException("Did not get lock on %s" % lock_path)

        node.lock = lock

    def unlockNode(self, node):
        '''
        Unlock a node.

        The node must already have been locked.

        :param Node node: The node which should be unlocked.

        :raises LockException: If ``node.lock`` is None.
        '''

        if node.lock is None:
            raise LockException("Node %s does not hold a lock" % (node, ))
        node.lock.release()
        node.lock = None

    def heldNodeCount(self, autohold_key):
        '''
        Count the number of nodes being held for the given tenant/project/job.

        :param set autohold_key: A set with the tenant/project/job names.

        :returns: The number of nodes in the hold state whose ``hold_job``
            equals the space-joined key; 0 if the node root does not exist.
        '''
        identifier = " ".join(autohold_key)
        try:
            nodes = self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return 0

        count = 0
        for nodeid in nodes:
            node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
            try:
                node_data, node_stat = self.client.get(node_path)
            except kze.NoNodeError:
                # The node was removed between listing the children and
                # reading it; a vanished node cannot be held.
                continue
            if not node_data:
                self.log.warning("Node ID %s has no data", nodeid)
                continue
            node_data = self._strToDict(node_data)
            if (node_data['state'] == zuul.model.STATE_HOLD
                    and node_data.get('hold_job') == identifier):
                count += 1
        return count
Exemple #5
0
class ZK(object):
    """ZooKeeper-backed state store for one beansdb cluster.

    All data lives below ``/beansdb/<cluster>``. The ``path_*`` / ``_path_*``
    methods build znode paths; the remaining methods read and write routes,
    gc state, jobs, proxy/backup settings and disk info through ``self.zk``.
    """

    def __init__(self, cluster, zkservers=None):
        """Connect to ZooKeeper and make sure the cluster root exists.

        :param cluster: cluster name, used as the last path component of the
            root path.
        :param zkservers: host string for KazooClient; when falsy the value
            of ``get_zkservers()`` is used.
        """
        if not zkservers:
            zkservers = get_zkservers()
        self.cluster = cluster
        self.zkpath = "/beansdb/%s" % cluster
        self.zk = KazooClient(hosts=zkservers)
        self.zk.start()
        self.zk.ensure_path(self.zkpath)

    def _path_route(self):
        """store curr version.
           children store route.yaml`s in sequence, start with routes_
        """
        return "%s/route" % self.zkpath

    def _path_gcs(self):
        return "%s/gc" % self.zkpath

    def _path_gc(self, bucket):
        """Path of the gc state node for ``bucket``, directly below the gc root.

        gc_get() lists the children of the gc root and reads each one through
        this helper, so the path must be ``<gc root>/<bucket>``.
        """
        return "%s/%s" % (self._path_gcs(), bucket)

    def _path_gc_host(self, host):
        return "%s/%s" % (self._path_gcs(), host)

    def _path_gc_bucket(self, host, bucket_str):
        return "%s/%s" % (self._path_gc_host(host), bucket_str)

    def _path_backup(self):
        return "%s/backup" % self.zkpath

    def _path_proxy(self):
        return "%s/proxy" % self.zkpath

    def _path_servers(self):
        return "%s/servers" % self.zkpath

    def _path_disks(self):
        return "%s/disks" % self.zkpath

    def _path_disk(self, host):
        return "%s/%s" % (self._path_disks(), host)

    def _path_jobs(self):
        """store rerouting job
           children store jobs in pickle
        """
        return "%s/jobs" % self.zkpath

    def _path_job(self, key):
        return "%s/%s" % (self._path_jobs(), key)

    def path_jobs(self):
        return self._path_jobs()

    def path_job(self, key):
        return self._path_job(key)

    def _path_migrate(self):
        return "%s/migrate" % self.zkpath

    def path_migrate_status(self, host):
        return "%s/%s" % (self._path_migrate(), host)

    def path_prepared_lock(self):
        return "%s/preparedlock" % self.zkpath

    def path_migrate_lock(self):
        return "%s/migratelock" % self.zkpath

    def path_prepared_jobs(self):
        return "%s/prepared_jobs" % self.zkpath

    def path_prepared_job(self, key):
        return "%s/%s" % (self.path_prepared_jobs(), key)

    def path_err_jobs(self):
        return "%s/error_jobs" % self.zkpath

    def path_err_job(self, key):
        return "%s/%s" % (self.path_err_jobs(), key)

    def reroute_set(self, key):
        """Store ``key`` as the data of the jobs node."""
        path = self._path_jobs()
        self.zk.set(path, key)

    def reroute_get(self):
        """Return the data stored on the jobs node."""
        path = self._path_jobs()
        curr = self.zk.get(path)[0]
        return curr

    def reroute_clear(self):
        """Reset the jobs node data to the string ``"None"``."""
        self.reroute_set("None")

    def all_server_set(self, content):
        """Store ``content`` as JSON on the servers node, creating it if needed."""
        path = self._path_servers()
        self.zk.ensure_path(path)
        self.zk.set(path, json.dumps(content))

    def all_server_get(self):
        """Return the stored server list (UTF-8 encoded), or [] if unset."""
        if not self.zk.exists(self._path_servers()):
            return []
        raw = json.loads(self.zk.get(self._path_servers())[0])
        return [host.encode('utf-8') for host in raw]

    def disk_info_set(self, host, content=None):
        """
        store host's disk info

        :param host: key under which the info is stored.
        :param content: JSON-serialisable info; defaults to an empty dict.
        :return: the complete disk-info mapping after the update.
        """
        # None instead of a shared mutable default argument.
        if content is None:
            content = {}
        data, _ = self._ensure_zk_path(self._path_disks())
        disk_info = json.loads(data)
        disk_info[host] = content
        self.zk.set(self._path_disks(), json.dumps(disk_info))
        return disk_info

    def disk_info_get(self, host):
        """Return the disk info stored for ``host``, or None if there is none."""
        data, _ = self._ensure_zk_path(self._path_disks())
        disk_info = json.loads(data)
        return disk_info.get(host)

    def _ensure_zk_path(self, path):
        """Return ``zk.get(path)``, first creating the node with ``{}`` if absent."""
        if not self.zk.exists(path):
            self.zk.ensure_path(path)
            self.zk.set(path, json.dumps({}))
        return self.zk.get(path)

    def migrate_status_get(self, host):
        """Return the migrate status data stored for ``host``."""
        path = self.path_migrate_status(host)
        status, _ = self.zk.get(path)
        return status

    def migrate_status_set(self, host, status):
        """Store ``status`` for ``host``, creating the node if needed."""
        path = self.path_migrate_status(host)
        self.zk.ensure_path(path)
        self.zk.set(path, status)

    def route_set(self, content, commit=False):
        """Store ``content`` as a new sequential route node.

        :param commit: when true, also record the new version as current.
        :return: the integer version parsed from the last ten characters of
            the created node's path.
        """
        path = self._path_route()
        res = self.zk.create(path + '/route_', content, sequence=True)
        ver = int(res[-10:])
        if commit:
            self.route_version_set(ver)
        return ver

    def route_get(self, ver=-1):
        """Return the route content for ``ver`` (current version if negative)."""
        path = self._path_route()
        if ver < 0:
            ver = int(self.zk.get(path)[0])
        return self.zk.get(path + "/route_%010d" % ver)[0]

    def route_version_set(self, ver):
        """Record ``ver`` as the current route version."""
        path = self._path_route()
        self.zk.set(path, str(ver))

    def route_version_get(self):
        """Return the current route version as an int."""
        path = self._path_route()
        return int(self.zk.get(path)[0])

    def route_verison_get_all(self):
        """Return all stored route versions, sorted ascending."""
        path = self._path_route()
        vers = self.zk.get_children(path)
        return sorted(int(r[-10:]) for r in vers)

    def route_verison_get_newest(self):
        """Return the highest stored route version."""
        return max(self.route_verison_get_all())

    def route_watch(self, func):
        """Register ``func`` as a data watch on the route node."""
        path = self._path_route()
        self.zk.DataWatch(path)(func)

    def gc_get(self):
        """Return ``{bucket: state}`` for every child of the gc root."""
        buckets = self.zk.get_children(self._path_gcs())
        return {b: self.zk.get(self._path_gc(b))[0] for b in buckets}

    def gc_set(self, buckets, state):
        """ gc cron set busy and idle
            migrate cron set block and idle

        :return: when ``state`` is ``"block"`` and some buckets are busy, the
            list of busy paths (nothing is written); otherwise None.
        """
        assert state in ("busy", "idle", "block")
        paths = [self._path_gc(bucket) for bucket in buckets]
        for p in paths:
            self.zk.ensure_path(p)
        if state == "block":
            # zk.get() returns (data, stat); compare the data element.
            busy = [p for p in paths if self.zk.get(p)[0] == "busy"]
            if len(busy) > 0:
                return busy
        for p in paths:
            self.zk.set(p, state)

    def gc_set_bucket(self, host, bucket, state):
        """Set the gc state of one bucket on one host.

        :return: when ``state`` is ``"block"`` and the bucket is busy, the
            bucket's path (nothing is written); otherwise None.
        """
        assert state in ("busy", "idle", "block")
        path = self._path_gc_bucket(host, bucket)
        self.zk.ensure_path(path)
        # zk.get() returns (data, stat); compare the data element.
        if state == "block" and self.zk.get(path)[0] == "busy":
            return path
        self.zk.set(path, state)

    def gc_get_bucket(self, host, bucket):
        """Return the gc state stored for ``bucket`` on ``host``."""
        path = self._path_gc_bucket(host, bucket)
        return self.zk.get(path)[0]

    def gc_get_status(self, host):
        """Return the child names of the host's gc node, or None if absent."""
        path = self._path_gc_host(host)
        if self.zk.exists(path):
            return self.zk.get_children(path)

    def gc_unblock_bucket(self, host, bucket):
        """Set a bucket back to idle if it is currently blocked."""
        stats = self.gc_get_bucket(host, bucket)
        if stats == 'block':
            self.gc_set_bucket(host, bucket, 'idle')

    def gc_unblock(self):
        """Set blocked buckets that no job refers to back to idle.

        A job key refers to the bucket named by the text after its last
        underscore.
        """
        keys = self.job_list()
        buckets = set(key.split("_")[-1] for key in keys)

        stats = self.gc_get()
        # Compare the bucket name, not its state, with the buckets in use.
        to_unblock = [b for b, s in stats.items()
                      if s == 'block' and b not in buckets]
        if to_unblock:
            logger.info('unblock gc: %s', to_unblock)
            self.gc_set(to_unblock, 'idle')

    def proxies_get(self):
        """Return the JSON-decoded data of the proxy node."""
        data, _ = self.zk.get(self._path_proxy())
        return json.loads(data)

    def proxies_set(self, addrs):
        """Store ``addrs`` as JSON on the proxy node."""
        path = self._path_proxy()
        self.zk.set(path, json.dumps(addrs))

    def backup_get(self):
        """Return the JSON-decoded data of the backup node."""
        data, _ = self.zk.get(self._path_backup())
        return json.loads(data)

    def backup_set(self, dic):
        """Store ``dic`` as JSON on the backup node."""
        self.zk.set(self._path_backup(), json.dumps(dic))

    # SECURITY NOTE(review): the *_job_get methods below unpickle data read
    # from ZooKeeper. pickle.loads can execute arbitrary code, so this is
    # only safe if every writer to these znodes is trusted.

    def job_get(self, key):
        """Return the unpickled job stored under ``key``."""
        return pickle.loads(self.zk.get(self._path_job(key))[0])

    def job_delete(self, key):
        """Delete the job node for ``key``."""
        self.zk.delete(self._path_job(key))

    def job_set(self, key, job):
        """Overwrite the existing job node for ``key`` with pickled ``job``."""
        path = self._path_job(key)
        self.zk.set(path, pickle.dumps(job))

    def job_create(self, key, job):
        """Create the job node for ``key`` holding pickled ``job``."""
        path = self._path_job(key)
        self.zk.ensure_path(self.path_jobs())
        self.zk.create(path, pickle.dumps(job))

    def job_exist(self, key):
        """Return the result of ``zk.exists`` for the job node of ``key``."""
        return self.zk.exists(self._path_job(key))

    def job_list(self):
        """Return the child names of the jobs node."""
        return self.zk.get_children(self._path_jobs())

    def prepared_job_set(self, key, job):
        """Create the prepared-job node for ``key`` holding pickled ``job``."""
        path = self.path_prepared_job(key)
        self.zk.ensure_path(self.path_prepared_jobs())
        self.zk.create(path, pickle.dumps(job))

    def prepared_job_get(self, key):
        """Return the unpickled prepared job stored under ``key``."""
        return pickle.loads(self.zk.get(self.path_prepared_job(key))[0])

    def prepared_job_delete(self, key):
        """Delete the prepared-job node for ``key``."""
        self.zk.delete(self.path_prepared_job(key))

    def prepared_job_exist(self, key):
        """Return the result of ``zk.exists`` for the prepared-job node."""
        return self.zk.exists(self.path_prepared_job(key))

    def err_job_set(self, key, job):
        """Create the error-job node for ``key`` holding pickled ``job``."""
        path = self.path_err_job(key)
        self.zk.ensure_path(self.path_err_jobs())
        self.zk.create(path, pickle.dumps(job))

    def err_job_get(self, key):
        """Return the unpickled error job stored under ``key``."""
        return pickle.loads(self.zk.get(self.path_err_job(key))[0])

    def err_job_delete(self, key):
        """Delete the error-job node for ``key``."""
        self.zk.delete(self.path_err_job(key))

    def err_job_exist(self, key):
        """Return the result of ``zk.exists`` for the error-job node."""
        return self.zk.exists(self.path_err_job(key))
Exemple #6
0
class ZooKeeper(object):
    """Track Solr cluster state held in ZooKeeper and resolve collection hosts.

    Kazoo watches installed in ``__init__`` keep ``collections``,
    ``liveNodes`` and ``aliases`` up to date; the ``get*`` methods answer
    queries from that cached state.
    """

    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = '/live_nodes'
    ALIASES = '/aliases.json'
    COLLECTION_STATUS = '/collections'
    CLUSTER_DETAILED_STATE = '/collections/%s/state.json'
    CLUSTER_STATE = '/clusterstate.json'
    SHARDS = 'shards'
    REPLICAS = 'replicas'
    STATE = 'state'
    ACTIVE = 'active'
    LEADER = 'leader'
    BASE_URL = 'base_url'
    TRUE = 'true'
    FALSE = 'false'
    COLLECTION = 'collection'

    def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None):
        """Start the client and install the cluster-state watches.

        :param zkServerAddress: host string passed to KazooClient when no
            ``kazoo_client`` is supplied.
        :param timeout: timeout passed to KazooClient.
        :param max_retries: ``max_tries`` for both command and connection
            retries.
        :param kazoo_client: optional pre-built client to use instead of
            creating one; ``start()`` is called on it either way.
        :raises RuntimeError: if the ``kazoo`` library is not available.
        """
        if KazooClient is None:
            message = 'ZooKeeper requires the `kazoo` library to be installed'
            logging.error(message)
            raise RuntimeError(message)

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(zkServerAddress, read_only=True, timeout=timeout,
                                  command_retry={'max_tries': max_retries},
                                  connection_retry={'max_tries': max_retries})
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            # NOTE(review): only LOST and SUSPENDED are recorded, so
            # self.state never returns to CONNECTED after a reconnect.
            # Left unchanged; confirm whether that is intended.
            if state == KazooState.LOST:
                self.state = state
            elif state == KazooState.SUSPENDED:
                self.state = state

        self.zk.add_listener(connectionListener)

        def watchClusterDetailedState(data, *args, **kwargs):
            if not data:
                LOG.warning("No cluster state available: no collections defined?")
            else:
                self.collections.update(json.loads(data.decode('utf-8')))
                LOG.info('Updated collections: %s', self.collections)

        # Collections that already have a state.json data watch. The children
        # watch fires on every change to the child list; without this set
        # each firing would add another watch for every existing collection.
        watched_collections = set()

        @self.zk.ChildrenWatch(ZooKeeper.COLLECTION_STATUS)
        def watchCollections(children):
            LOG.info("Updated collection: %s", children)
            for child in children:
                if child in watched_collections:
                    continue
                watched_collections.add(child)
                self.zk.DataWatch(self.CLUSTER_DETAILED_STATE % child, watchClusterDetailedState)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if not data:
                LOG.warning("No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode('utf-8'))
                LOG.info('Updated collections: %s', self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            LOG.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode('utf-8'))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    LOG.warning('Expected to find %s in alias update %s',
                                ZooKeeper.COLLECTION, json_data.keys())
            else:
                self.aliases = None
            LOG.info("Updated aliases: %s", self.aliases)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """Return the base URLs of active replicas serving ``collname``.

        Aliases are resolved through getAliasHosts(). Only replicas in the
        active state inside active shards are returned; duplicates are
        removed, keeping first-seen order.

        :param only_leader: when true, return only replicas whose leader
            flag is ``'true'``.
        :param seen_aliases: aliases already visited, used to detect
            circular alias definitions.
        :raises SolrError: if ``collname`` is neither a known alias nor a
            known collection.
        """
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        hosts = []
        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)
        collection = self.collections[collname]
        for shard in collection[ZooKeeper.SHARDS].values():
            if shard[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                continue
            for replica in shard[ZooKeeper.REPLICAS].values():
                if replica[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                    continue
                if only_leader and replica.get(ZooKeeper.LEADER) != ZooKeeper.TRUE:
                    continue
                base_url = replica[ZooKeeper.BASE_URL]
                if base_url not in hosts:
                    hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        """Return the hosts of every collection behind the alias ``collname``.

        An alias value is a comma-separated list of names, each resolved
        with getHosts(). An alias already present in ``seen_aliases`` is
        logged and yields no hosts.
        """
        if seen_aliases:
            if collname in seen_aliases:
                LOG.warning("%s in circular alias definition - ignored", collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)
        collections = self.aliases[collname].split(",")
        hosts = []
        for collection in collections:
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        """Return ``<host>/<collname>`` for a randomly chosen matching host.

        :raises SolrError: if no host is available.
        """
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError('ZooKeeper returned no active shards!')
        return '%s/%s' % (random.choice(hosts), collname)

    def getLeaderURL(self, collname):
        """Return a URL for a randomly chosen leader replica of ``collname``."""
        return self.getRandomURL(collname, only_leader=True)
Exemple #7
0
class ServiceRegister(object):
    """Thin wrapper around a KazooClient that reports errors as return values.

    Most methods catch ZooKeeper errors and return the error text (and, for
    reads, a negative status code) instead of raising.
    """

    def __init__(self, hosts="127.0.0.1:2181", read_only=True, logger=None):
        """
        Service registration.
        :param hosts: ZooKeeper cluster address list
        :param read_only: whether to connect read-only
        :param logger: logger object passed to KazooClient; when falsy,
            ``logging.basicConfig()`` is called first
        """
        if not logger:
            import logging
            logging.basicConfig()
        self._zk = KazooClient(hosts, read_only=read_only, logger=logger)
        self._zk.start()

    def restart(self):
        """Restart the underlying client."""
        self._zk.restart()

    def retry_get(self, path, watcher=None):
        """
        Read a node through the client's retry helper.
        :param path: node path
        :param watcher: watch callback
        :return: on success: node value, version; on failure: error text,
            error code.
        """
        # NOTE(review): get() catches ZookeeperError itself, so the retry
        # helper may never see an exception to retry on - confirm.
        return self._zk.retry(self.get, path, watcher)

    def lock(self, path, identifier, timeout=None):
        """
        Distributed lock.
        :param path: lock path
        :param identifier: lock identifier
        :param timeout: timeout
        :return: lock object
        """
        return DLock(self._zk, path, identifier, timeout)

    def exist(self, path):
        """
        Check whether a node exists.
        :param path: node path
        :return: True if it exists, False otherwise.
        """
        state = self._zk.exists(path)
        return state is not None

    def create(self, path, value=""):
        """
        Create a node, creating missing parents.
        :param path: node path
        :param value: node value
        :return: the created path; ``path`` itself if the node already
            exists; the error text on any other ZooKeeper error.
        """
        try:
            res_path = self._zk.create(path, value, makepath=True)
        except NodeExistsError:
            return path
        except NoNodeError as e:
            # str(e) rather than e.message, which Python 3 exceptions lack.
            return str(e)
        except ZookeeperError as e:
            return str(e)
        else:
            return res_path

    def get(self, path, watcher=None):
        """
        Read a node's value and optionally watch it.
        :param path: node path
        :param watcher: watch callback; registered as a data watch on
            ``path`` when given
        :return: on success: node value, version; on failure: error text,
            error code (-2 node missing, -3 other ZooKeeper error).
        """
        try:
            data, state = self._zk.get(path)
            # Only register a watch when there is a callback to call.
            if watcher is not None:
                self._zk.DataWatch(path, watcher)
        except NoNodeError as e:
            return str(e), -2
        except ZookeeperError as e:
            return str(e), -3
        else:
            return data, state.version

    def get_children(self, path, watcher=None):
        """
        List a node's children and optionally watch the node.
        :param path: node path
        :param watcher: watch callback
        :return: (children, 0) on success; ([], -2) if the node is missing;
            ([], -3) on any other ZooKeeper error.
        """
        try:
            data = self._zk.get_children(path)
            # NOTE(review): this registers a *data* watch, so the callback
            # fires on data changes rather than child changes. Kept as is
            # because switching to a children watch would change the
            # callback's arguments - confirm what callers expect.
            if watcher is not None:
                self._zk.DataWatch(path, watcher)
        except NoNodeError:
            return [], -2
        except ZookeeperError:
            return [], -3
        else:
            return data, 0

    def set(self, path, value, version=-1):
        """
        Update a node's value.
        :param path: node path
        :param value: node value
        :param version: expected version passed to the client (default -1)
        :return: on success: the new version; on failure: error text.
        """
        try:
            state = self._zk.set(path, value, version)
        except BadVersionError as e:
            return str(e)
        except NoNodeError as e:
            return str(e)
        except ZookeeperError as e:
            return str(e)
        else:
            return state.version
Exemple #8
0
class AnalyticsDiscovery(gevent.Greenlet):
    def _sandesh_connection_info_update(self, status, message):
        """Publish a ZooKeeper connection status and remember it.

        :param status: name of a ``ConnectionStatus`` attribute (callers in
            this class pass 'INIT', 'UP' or 'DOWN')
        :param message: text forwarded with the status update
        """
        previous = self._conn_state
        current = getattr(ConnectionStatus, status)

        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name,
                               status=current,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        # Nothing is logged while the previously stored state is falsy
        # (e.g. the initial None).
        if previous:
            went_down = (previous != ConnectionStatus.DOWN
                         and current == ConnectionStatus.DOWN)
            if went_down:
                self._logger.error(
                    'Connection to Zookeeper down: %s' % (message))

            came_up = (previous != current
                       and current == ConnectionStatus.UP)
            if came_up:
                self._logger.error('Connection to Zookeeper ESTABLISHED')

        self._conn_state = current

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Connection-state listener registered on the Kazoo client.

        CONNECTED marks the connection UP and requests a republish;
        LOST reports DOWN and terminates the process; SUSPENDED reports
        INIT. Any other state is only logged.

        :param state: the ``KazooState`` value being reported
        """
        self._logger.error("Analytics Discovery listen %s" % str(state))

        if state == KazooState.CONNECTED:
            self._sandesh_connection_info_update(status='UP', message='')
            self._logger.error("Analytics Discovery to publish %s" %
                               str(self._pubinfo))
            # Picked up by the _run loop, which republishes and resyncs.
            self._reconnect = True
            return

        if state == KazooState.LOST:
            self._logger.error("Analytics Discovery connection LOST")
            # The session with the ZooKeeper server is gone; the best
            # option is to exit the process and start all over again.
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
            return

        if state == KazooState.SUSPENDED:
            self._logger.error("Analytics Discovery connection SUSPENDED")
            # Report that the connection is being retried.
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        """Data-watch callback for one child node of a watched service.

        Non-empty ``data`` is parsed as JSON and cached as a key-sorted
        ``OrderedDict``; empty data drops the cached entry. If a callback
        is configured for ``watcher``, it is queued in ``_pendingcb``.

        :param watcher: key of the watched service
        :param child: name of the child node
        :param data: node contents passed by the watch
        :param stat: node stat passed by the watch (unused)
        :param event: event passed by the watch; only logged
        """
        self._logger.error(
            "Analytics Discovery %s ChildData : child %s, data %s, event %s"
            % (watcher, child, data, event))

        if not data:
            # No content: forget the child if it was cached.
            self._wchildren[watcher].pop(child, None)
        else:
            parsed = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(parsed.items()))

        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        """Children-watch callback: log the children and request a resync.

        :param watcher: key of the watched service (unused here)
        :param children: child list passed by the watch; only logged
        """
        self._logger.error("Analytics Discovery Children %s" % children)
        # The _run loop re-reads all children when this flag is set.
        self._reconnect = True

    def __init__(self,
                 logger,
                 zkservers,
                 svc_name,
                 inst,
                 watchers=None,
                 zpostfix="",
                 freq=10):
        """Set up the discovery greenlet without connecting to ZooKeeper.

        :param logger: logger to use; when None the ``logging`` module
            itself is used after ``logging.basicConfig()``
        :param zkservers: comma-separated ZooKeeper server list
        :param svc_name: name of the service being published
        :param inst: instance name, used as the published node's name
        :param watchers: dict mapping a watched service key to its callback
            (a falsy callback disables notification); defaults to an empty
            dict
        :param zpostfix: suffix appended to "/analytics-discovery-" to form
            the base path
        :param freq: seconds the run loop sleeps between iterations
        """
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zkservers = zkservers
        self._zk = None
        self._pubinfo = None
        self._publock = Semaphore()
        # A fresh dict per instance: a ``{}`` default in the signature
        # would be one object shared by every instance created without
        # ``watchers``.
        self._watchers = {} if watchers is None else watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        """Publish, update or withdraw this instance's info in ZooKeeper.

        The info is always remembered in ``_pubinfo``. When the connection
        is UP, the instance node under ``<basepath>/<svc_name>/<inst>`` is
        created (ephemeral) or updated with ``pubinfo``; a ``pubinfo`` of
        None deletes the node if it exists. Any exception from the
        ZooKeeper calls is logged, the connection is reported DOWN and a
        reconnect is requested.

        This function can be called concurrently by the main processing
        loop as well as by clients. It is NOT re-entrant, so the whole body
        runs under ``_publock``.

        :param pubinfo: data to publish, or None to withdraw
        """
        # ``with`` releases the lock on every exit path, including
        # exceptions that ``except Exception`` below does not catch.
        with self._publock:
            self._pubinfo = pubinfo
            if self._conn_state != ConnectionStatus.UP:
                self._logger.error(
                    "Analytics Discovery cannot publish while down")
                return

            try:
                svc_path = self._basepath + "/" + self._svc_name
                inst_path = "%s/%s/%s" % \
                        (self._basepath, self._svc_name, self._inst)

                self._logger.error("ensure %s" % svc_path)
                self._logger.error("zk state %s (%s)" %
                                   (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(svc_path)
                self._logger.error("check for %s" % inst_path)

                node_exists = self._zk.exists(inst_path)
                if pubinfo is not None:
                    if node_exists:
                        self._zk.set(inst_path, self._pubinfo)
                    else:
                        self._zk.create(inst_path, self._pubinfo,
                                        ephemeral=True)
                elif node_exists:
                    self._logger.error("withdrawing published info!")
                    self._zk.delete(inst_path)

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True

    def _run(self):
        """Greenlet body: connect to ZooKeeper, then publish and watch.

        Phase 1 creates a KazooClient and retries until ``_conn_state`` is
        UP. Phase 2 installs a children watch for every configured watcher
        and then loops: pending watcher callbacks are delivered, and
        whenever ``_reconnect`` is set the published info is re-sent and
        every watched service's children are re-read. ``GreenletExit``
        stops and closes the client and ends the loop; an exception
        escaping the setup steps of phase 2 is logged and converted to
        ``SystemExit``.
        """
        # Phase 1: keep creating a client until the connection is up.
        while True:
            self._logger.error("Analytics Discovery zk start")
            self._zk = KazooClient(hosts=self._zkservers)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                # _zk_listen reports UP once the CONNECTED state arrives.
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                # Tear the failed client down before trying a fresh one.
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._svc_name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            # Start with an empty child cache and a children watch for
            # every watched service.
            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    # Deliver callbacks queued by _zk_datawatch, unless a
                    # full resync is about to happen anyway.
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](\
                                        sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        # Re-read every watched service from ZooKeeper.
                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath +
                                                             "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(self._basepath + "/" + \
                                            wk + "/" + elem,
                                            partial(self._zk_datawatch, wk, elem))

                                # Cache the child's JSON content as a
                                # key-sorted OrderedDict.
                                data_str, _ = self._zk.get(\
                                        self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            # Hand the refreshed child list to the callback.
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(
                                    self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    # Shutdown request: detach the listener, then stop and
                    # close the client, logging the outcome of each step.
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.remove_listener(self._zk_listen)
                    gevent.sleep(1)
                    try:
                        self._zk.stop()
                    except:
                        self._logger.error("Stopping kazooclient failed")
                    else:
                        self._logger.error("Stopping kazooclient successful")
                    try:
                        self._zk.close()
                    except:
                        self._logger.error("Closing kazooclient failed")
                    else:
                        self._logger.error("Closing kazooclient successful")
                    break

                except Exception as ex:
                    # Any other failure: log it and retry via a full resync
                    # on the next iteration.
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            # Failure outside the inner loop's own handlers: log and exit.
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
Exemple #9
0
class ZooKeeper(object):
    '''
    Class implementing the ZooKeeper interface.

    This class uses the facade design pattern to keep common interaction
    with the ZooKeeper API simple and consistent for the caller, and
    limits coupling between objects. It allows for more complex interactions
    by providing direct access to the client connection when needed (though
    that is discouraged). It also provides for a convenient entry point for
    testing only ZooKeeper interactions.
    '''

    log = logging.getLogger("zuul.zk.ZooKeeper")

    REQUEST_ROOT = '/nodepool/requests'
    REQUEST_LOCK_ROOT = "/nodepool/requests-lock"
    NODE_ROOT = '/nodepool/nodes'
    HOLD_REQUEST_ROOT = '/zuul/hold-requests'

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self, enable_cache=True):
        '''
        Initialize the ZooKeeper object.

        :param bool enable_cache: When True, enables caching of ZooKeeper
            objects (e.g., HoldRequests).
        '''
        self.client = None
        self._became_lost = False
        self._last_retry_log = 0
        self.enable_cache = enable_cache

        # The caching model we use is designed around handing out model
        # data as objects. To do this, we use two caches: one is a TreeCache
        # which contains raw znode data (among other details), and one for
        # storing that data serialized as objects. This allows us to return
        # objects from the APIs, and avoids calling the methods to serialize
        # the data into objects more than once.
        self._hold_request_tree = None
        self._cached_hold_requests = {}

    def _dictToStr(self, data):
        return json.dumps(data).encode('utf8')

    def _strToDict(self, data):
        return json.loads(data.decode('utf8'))

    def _connection_listener(self, state):
        '''
        Log Kazoo connection state changes and remember a lost session.

        Any state other than LOST or SUSPENDED is logged as CONNECTED.

        .. warning:: This method must not block.
        '''
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
            self._became_lost = True
            return

        if state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
            return

        self.log.debug("ZooKeeper connection: CONNECTED")

    @property
    def connected(self):
        '''True when the client state is KazooState.CONNECTED.'''
        current = self.client.state
        return current == KazooState.CONNECTED

    @property
    def suspended(self):
        '''True when the client state is KazooState.SUSPENDED.'''
        current = self.client.state
        return current == KazooState.SUSPENDED

    @property
    def lost(self):
        '''True when the client state is KazooState.LOST.'''
        current = self.client.state
        return current == KazooState.LOST

    @property
    def didLoseConnection(self):
        return self._became_lost

    def resetLostFlag(self):
        self._became_lost = False

    def logConnectionRetryEvent(self):
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    def connect(self, hosts, read_only=False, timeout=10.0):
        '''
        Establish a connection with ZooKeeper cluster.

        Convenience method if a pre-existing ZooKeeper connection is not
        supplied to the ZooKeeper object at instantiation time.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        :param bool read_only: If True, establishes a read-only connection.
        :param float timeout: The ZooKeeper session timeout, in
            seconds (default: 10.0).
        '''
        if self.client is None:
            self.client = KazooClient(hosts=hosts,
                                      read_only=read_only,
                                      timeout=timeout)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            while True:
                try:
                    self.client.start(1)
                    break
                except KazooTimeoutError:
                    self.logConnectionRetryEvent()

        if self.enable_cache:
            self._hold_request_tree = TreeCache(self.client,
                                                self.HOLD_REQUEST_ROOT)
            self._hold_request_tree.listen_fault(self.cacheFaultListener)
            self._hold_request_tree.listen(self.holdRequestCacheListener)
            self._hold_request_tree.start()

    def cacheFaultListener(self, e):
        self.log.exception(e)

    def holdRequestCacheListener(self, event):
        '''
        Keep the hold request object cache in sync with the TreeCache.
        '''
        try:
            self._holdRequestCacheListener(event)
        except Exception:
            self.log.exception(
                "Exception in hold request cache update for event: %s", event)

    def _holdRequestCacheListener(self, event):
        if hasattr(event.event_data, 'path'):
            # Ignore root node
            path = event.event_data.path
            if path == self.HOLD_REQUEST_ROOT:
                return

        if event.event_type not in (TreeEvent.NODE_ADDED,
                                    TreeEvent.NODE_UPDATED,
                                    TreeEvent.NODE_REMOVED):
            return

        path = event.event_data.path
        request_id = path.rsplit('/', 1)[1]

        if event.event_type in (TreeEvent.NODE_ADDED, TreeEvent.NODE_UPDATED):
            # Requests with no data are invalid
            if not event.event_data.data:
                return

            # Perform an in-place update of the already cached request
            d = self._bytesToDict(event.event_data.data)
            old_request = self._cached_hold_requests.get(request_id)
            if old_request:
                if event.event_data.stat.version <= old_request.stat.version:
                    # Don't update to older data
                    return
                old_request.updateFromDict(d)
                old_request.stat = event.event_data.stat
            else:
                request = zuul.model.HoldRequest.fromDict(d)
                request.id = request_id
                request.stat = event.event_data.stat
                self._cached_hold_requests[request_id] = request

        elif event.event_type == TreeEvent.NODE_REMOVED:
            try:
                del self._cached_hold_requests[request_id]
            except KeyError:
                pass

    def disconnect(self):
        '''
        Close the ZooKeeper cluster connection.

        You should call this method if you used connect() to establish a
        cluster connection.
        '''
        if self._hold_request_tree is not None:
            self._hold_request_tree.close()
            self._hold_request_tree = None

        if self.client is not None and self.client.connected:
            self.client.stop()
            self.client.close()
            self.client = None

    def resetHosts(self, hosts):
        '''
        Reset the ZooKeeper cluster connection host list.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        '''
        if self.client is not None:
            self.client.set_hosts(hosts=hosts)

    def submitNodeRequest(self, node_request, watcher):
        '''
        Submit a request for nodes to Nodepool.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.

        :param callable watcher: A callable object that will be
            invoked each time the request is updated.  It is called
            with two arguments: (node_request, deleted) where
            node_request is the same argument passed to this method,
            and deleted is a boolean which is True if the node no
            longer exists (notably, this will happen on disconnection
            from ZooKeeper).  The watcher should return False when
            further updates are no longer necessary.
        '''
        node_request.created_time = time.time()
        data = node_request.toDict()

        path = '{}/{:0>3}-'.format(self.REQUEST_ROOT, node_request.priority)
        path = self.client.create(path,
                                  self._dictToStr(data),
                                  makepath=True,
                                  sequence=True,
                                  ephemeral=True)
        reqid = path.split("/")[-1]
        node_request.id = reqid

        def callback(data, stat):
            if data:
                self.updateNodeRequest(node_request, data)
            deleted = (data is None)  # data *are* none
            return watcher(node_request, deleted)

        self.client.DataWatch(path, callback)

    def deleteNodeRequest(self, node_request):
        '''
        Delete a request for nodes.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.
        '''

        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        try:
            self.client.delete(path)
        except kze.NoNodeError:
            pass

    def nodeRequestExists(self, node_request):
        '''
        See if a NodeRequest exists in ZooKeeper.

        :param NodeRequest node_request: A NodeRequest to verify.

        :returns: True if the request exists, False otherwise.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        if self.client.exists(path):
            return True
        return False

    def storeNodeRequest(self, node_request):
        '''Store the node request.

        The request is expected to already exist and is updated in its
        entirety.

        :param NodeRequest node_request: The request to update.
        '''

        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        self.client.set(path, self._dictToStr(node_request.toDict()))

    def updateNodeRequest(self, node_request, data=None):
        '''Refresh an existing node request.

        :param NodeRequest node_request: The request to update.
        :param dict data: The data to use; query ZK if absent.
        '''
        if data is None:
            path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
            data, stat = self.client.get(path)
        data = self._strToDict(data)
        request_nodes = list(node_request.nodeset.getNodes())
        for i, nodeid in enumerate(data.get('nodes', [])):
            request_nodes[i].id = nodeid
            self.updateNode(request_nodes[i])
        node_request.updateFromDict(data)

    def storeNode(self, node):
        '''Store the node.

        The node is expected to already exist and is updated in its
        entirety.

        :param Node node: The node to update.
        '''

        path = '%s/%s' % (self.NODE_ROOT, node.id)
        self.client.set(path, self._dictToStr(node.toDict()))

    def updateNode(self, node):
        '''Refresh an existing node.

        :param Node node: The node to update.
        '''

        node_path = '%s/%s' % (self.NODE_ROOT, node.id)
        node_data, node_stat = self.client.get(node_path)
        node_data = self._strToDict(node_data)
        node.updateFromDict(node_data)

    def lockNode(self, node, blocking=True, timeout=None):
        '''
        Lock a node.

        This should be called as soon as a request is fulfilled and
        the lock held for as long as the node is in-use.  It can be
        used by nodepool to detect if Zuul has gone offline and the
        node should be reclaimed.

        On success the lock object is stored in the node's `lock`
        attribute.

        :param Node node: The node which should be locked.
        :param blocking: Passed to the lock's acquire().
        :param timeout: Passed to the lock's acquire().

        :raises: LockException if acquiring timed out or the lock was not
            obtained.
        '''
        lock_path = '%s/%s/lock' % (self.NODE_ROOT, node.id)
        try:
            node_lock = Lock(self.client, lock_path)
            acquired = node_lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException("Timeout trying to acquire lock %s" %
                                lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if acquired:
            node.lock = node_lock
            return

        raise LockException("Did not get lock on %s" % lock_path)

    def unlockNode(self, node):
        '''
        Unlock a node.

        The node must already have been locked.

        :param Node node: The node which should be unlocked.
        '''

        if node.lock is None:
            raise LockException("Node %s does not hold a lock" % (node, ))
        node.lock.release()
        node.lock = None

    def lockNodeRequest(self, request, blocking=True, timeout=None):
        '''
        Lock a node request.

        This will set the `lock` attribute of the request object when the
        lock is successfully acquired, and then refresh the request from
        ZooKeeper.

        :param NodeRequest request: The request to lock.
        :param bool blocking: Whether or not to block on trying to
            acquire the lock
        :param int timeout: When blocking, how long to wait for the lock
            to get acquired. None, the default, waits forever.

        :raises: LockException if acquiring timed out, if the request was
            not found, or if the lock was not obtained.
        '''
        lock_path = "%s/%s" % (self.REQUEST_LOCK_ROOT, request.id)
        try:
            request_lock = Lock(self.client, lock_path)
            acquired = request_lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException("Timeout trying to acquire lock %s" % lock_path)
        except kze.NoNodeError:
            # Reported through the "did not get lock" error below.
            acquired = False
            self.log.error("Request not found for locking: %s", request)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not acquired:
            raise LockException("Did not get lock on %s" % lock_path)

        request.lock = request_lock
        self.updateNodeRequest(request)

    def unlockNodeRequest(self, request):
        '''
        Unlock a node request.

        The request must already have been locked.

        :param NodeRequest request: The request to unlock.

        :raises: ZKLockException if the request is not currently locked.
        '''
        if request.lock is None:
            raise LockException("Request %s does not hold a lock" % request)
        request.lock.release()
        request.lock = None

    def heldNodeCount(self, autohold_key):
        '''
        Count the number of nodes being held for the given tenant/project/job.

        :param set autohold_key: A set with the tenant/project/job names.
        '''
        identifier = " ".join(autohold_key)
        try:
            nodes = self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return 0

        count = 0
        for nodeid in nodes:
            node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
            try:
                node_data, node_stat = self.client.get(node_path)
            except kze.NoNodeError:
                # Node got removed on us. Just ignore.
                continue

            if not node_data:
                self.log.warning("Node ID %s has no data", nodeid)
                continue
            node_data = self._strToDict(node_data)
            if (node_data['state'] == zuul.model.STATE_HOLD
                    and node_data.get('hold_job') == identifier):
                count += 1
        return count

    # Copy of nodepool/zk.py begins here
    NODE_ROOT = "/nodepool/nodes"
    LAUNCHER_ROOT = "/nodepool/launchers"

    def _bytesToDict(self, data):
        return json.loads(data.decode('utf8'))

    def _launcherPath(self, launcher):
        return "%s/%s" % (self.LAUNCHER_ROOT, launcher)

    def _nodePath(self, node):
        return "%s/%s" % (self.NODE_ROOT, node)

    def getRegisteredLaunchers(self):
        '''
        Get a list of all launchers that have registered with ZooKeeper.

        :returns: A list of Launcher objects, or empty list if none are found.
        '''
        try:
            launcher_ids = self.client.get_children(self.LAUNCHER_ROOT)
        except kze.NoNodeError:
            return []

        objs = []
        for launcher in launcher_ids:
            path = self._launcherPath(launcher)
            try:
                data, _ = self.client.get(path)
            except kze.NoNodeError:
                # launcher disappeared
                continue

            objs.append(Launcher.fromDict(self._bytesToDict(data)))
        return objs

    def getNodes(self):
        '''
        Get the current list of all nodes.

        :returns: A list of nodes.
        '''
        try:
            return self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return []

    def getNode(self, node):
        '''
        Get the data for a specific node.

        :param str node: The node ID.

        :returns: The node data, or None if the node was not found.
        '''
        path = self._nodePath(node)
        try:
            data, stat = self.client.get(path)
        except kze.NoNodeError:
            return None
        if not data:
            return None

        d = self._bytesToDict(data)
        d['id'] = node
        return d

    def nodeIterator(self):
        '''
        Utility generator method for iterating through all nodes.
        '''
        for node_id in self.getNodes():
            node = self.getNode(node_id)
            if node:
                yield node

    def getHoldRequests(self):
        '''
        Get the current list of all hold requests.
        '''
        try:
            return sorted(self.client.get_children(self.HOLD_REQUEST_ROOT))
        except kze.NoNodeError:
            return []

    def getHoldRequest(self, hold_request_id):
        path = self.HOLD_REQUEST_ROOT + "/" + hold_request_id
        try:
            data, stat = self.client.get(path)
        except kze.NoNodeError:
            return None
        if not data:
            return None

        obj = zuul.model.HoldRequest.fromDict(self._strToDict(data))
        obj.id = hold_request_id
        obj.stat = stat
        return obj

    def storeHoldRequest(self, hold_request):
        '''
        Create or update a hold request.

        If this is a new request with no value for the `id` attribute of the
        passed in request, then `id` will be set with the unique request
        identifier after successful creation.

        :param HoldRequest hold_request: Object representing the hold request.
        '''
        if hold_request.id is None:
            path = self.client.create(self.HOLD_REQUEST_ROOT + "/",
                                      value=hold_request.serialize(),
                                      sequence=True,
                                      makepath=True)
            hold_request.id = path.split('/')[-1]
        else:
            path = self.HOLD_REQUEST_ROOT + "/" + hold_request.id
            self.client.set(path, hold_request.serialize())

    def _markHeldNodesAsUsed(self, hold_request):
        '''
        Changes the state for each held node for the hold request to 'used'.

        :returns: True if all nodes marked USED, False otherwise.
        '''
        def getHeldNodeIDs(request):
            node_ids = []
            for data in request.nodes:
                # TODO(Shrews): Remove type check at some point.
                # When autoholds were initially changed to be stored in ZK,
                # the node IDs were originally stored as a list of strings.
                # A later change embedded them within a dict. Handle both
                # cases here to deal with the upgrade.
                if isinstance(data, dict):
                    node_ids += data['nodes']
                else:
                    node_ids.append(data)
            return node_ids

        failure = False
        for node_id in getHeldNodeIDs(hold_request):
            node = self.getNode(node_id)
            if not node or node['state'] == zuul.model.STATE_USED:
                continue

            node['state'] = zuul.model.STATE_USED

            name = None
            label = None
            if 'name' in node:
                name = node['name']
            if 'label' in node:
                label = node['label']

            node_obj = zuul.model.Node(name, label)
            node_obj.updateFromDict(node)

            try:
                self.lockNode(node_obj, blocking=False)
                self.storeNode(node_obj)
            except Exception:
                self.log.exception(
                    "Cannot change HELD node state to USED "
                    "for node %s in request %s", node_obj.id, hold_request.id)
                failure = True
            finally:
                try:
                    if node_obj.lock:
                        self.unlockNode(node_obj)
                except Exception:
                    self.log.exception(
                        "Failed to unlock HELD node %s for request %s",
                        node_obj.id, hold_request.id)

        return not failure

    def deleteHoldRequest(self, hold_request):
        '''
        Delete a hold request.

        :param HoldRequest hold_request: Object representing the hold request.
        '''
        if not self._markHeldNodesAsUsed(hold_request):
            self.log.info(
                "Unable to delete hold request %s because "
                "not all nodes marked as USED.", hold_request.id)
            return

        path = self.HOLD_REQUEST_ROOT + "/" + hold_request.id
        try:
            self.client.delete(path, recursive=True)
        except kze.NoNodeError:
            pass

    def lockHoldRequest(self, request, blocking=True, timeout=None):
        '''
        Lock a hold request.

        This will set the `lock` attribute of the request object when the
        lock is successfully acquired.

        :param HoldRequest request: The hold request to lock.
        :param blocking: Passed to the lock's acquire().
        :param timeout: Passed to the lock's acquire().

        :raises: LockException if the request has no ID, if acquiring
            timed out, or if the lock was not obtained.
        '''
        if not request.id:
            raise LockException(
                "Hold request without an ID cannot be locked: %s" % request)

        lock_path = "%s/%s/lock" % (self.HOLD_REQUEST_ROOT, request.id)
        try:
            hold_lock = Lock(self.client, lock_path)
            acquired = hold_lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException("Timeout trying to acquire lock %s" % lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if acquired:
            request.lock = hold_lock
            return

        raise LockException("Did not get lock on %s" % lock_path)

    def unlockHoldRequest(self, request):
        '''
        Unlock a hold request.

        The request must already have been locked.

        :param HoldRequest request: The request to unlock.

        :raises: ZKLockException if the request is not currently locked.
        '''
        if request.lock is None:
            raise LockException("Request %s does not hold a lock" % request)
        request.lock.release()
        request.lock = None
Exemple #10
0
class ZookeeperStorage:
    """A low level storage object.

    Manages and publishes the zookeeper connection.

    Manages the database "schema" and allows access to multiple "groups"
    database servers, each representing one logical cluster.

    Znodes are laid out as ``<path_prefix><folder>/<group>-<key>`` where
    the folders used by this class are ``conn``, ``state``, ``static`` and
    ``lock``.
    """

    # Lazily created KazooClient; see the ``connection`` property.
    _zk = None

    def __init__(self, connection_string, path, timeout=10.0):
        """Remember the connection settings; no connection is opened here.

        :param connection_string: passed as ``hosts`` to ``KazooClient``.
        :param path: prefix of every znode path; a trailing ``/`` is
            appended when missing.
        :param timeout: passed as ``timeout`` to ``KazooClient``.
        """
        self._connection_string = connection_string
        self._path_prefix = path
        self._timeout = timeout
        if not self._path_prefix.endswith('/'):
            self._path_prefix += '/'
        # Keeps a reference to every watch object created by this instance.
        self._watchers = {}
        self._loop = asyncio.get_event_loop()

    @property
    def connection(self):
        """Return the KazooClient, creating it on first access."""
        if self._zk is None:
            self._zk = KazooClient(
                    hosts=self._connection_string,
                    timeout=self._timeout)
        return self._zk

    def dcs_connect(self):
        """Create the client if necessary and start it."""
        self.connection.start()

    def dcs_disconnect(self):
        """Stop the client and forget it.

        Calling this when no client was ever created is a no-op rather
        than an AttributeError on ``None``.
        """
        if self._zk is None:
            return
        self._zk.stop()
        self._zk = None

    def _dict_watcher(self, group, what, callback):
        """Install a ``DictWatch`` on folder *what* and return it.

        Every change reported by the watch calls *callback* with
        ``_get_clusters(state)``. A missing folder is created and the
        watch is retried.
        """
        def hook(state, key, from_val, to_val):
            callback(_get_clusters(state))
        path = self._folder_path(what)
        # A falsy group (None or '') is passed through unchanged.
        prefix = group + '-' if group else group
        try:
            watch = DictWatch(self._zk, path, hook, prefix=prefix)
        except kazoo.exceptions.NoNodeError:
            self._zk.create(path, makepath=True)
            return self._dict_watcher(group, what, callback)
        self._watchers[id(watch)] = watch
        return watch

    def _listen_connection(self, state):
        # NOTE(review): neither ``_connection_state_changes`` nor
        # ``_consume_connection_state_changes`` is defined in this class;
        # presumably a subclass or mixin provides them -- confirm.
        self._connection_state_changes.append(state)
        self._loop.call_soon_threadsafe(self._consume_connection_state_changes)

    def dcs_watch_conn_info(self, callback, group=None):
        """Watch the ``conn`` folder, optionally limited to *group*."""
        self._dict_watcher(group, 'conn', callback)

    def dcs_watch_state(self, callback, group=None):
        """Watch the ``state`` folder, optionally limited to *group*."""
        self._dict_watcher(group, 'state', callback)

    def _folder_path(self, folder):
        """Return the znode path of *folder*."""
        return self._path_prefix + folder

    def _path(self, group, folder, key):
        """Return the znode path of *key* of *group* inside *folder*."""
        return self._path_prefix + folder + '/' + group + '-' + key

    def _get_static(self, group, key):
        """Return the raw bytes stored under ``static``, or None if absent."""
        path = self._path(group, 'static', key)
        try:
            data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return data

    def _set_static(self, group, key, data, overwrite=False):
        """Store *data* under ``static``.

        :returns: True if the value was written, False if the node already
            existed and *overwrite* is false.
        """
        path = self._path(group, 'static', key)
        try:
            self._zk.create(path, data, makepath=True)
        except kazoo.exceptions.NodeExistsError:
            if overwrite:
                self._zk.set(path, data)
                return True
            return False
        return True

    def dcs_get_timeline(self, group):
        """Return the stored timeline of *group* as int (0 when unset)."""
        data = self._get_static(group, 'timeline')
        if data is None:
            data = b'0'
        return int(data.decode('ascii'))

    def dcs_set_timeline(self, group, timeline):
        """Store *timeline* for *group*.

        :raises ValueError: if the stored timeline is greater than
            *timeline*.
        """
        assert isinstance(timeline, int)
        existing = self.dcs_get_timeline(group)
        if existing > timeline:
            raise ValueError('Timelines can only increase.')
        timeline = str(timeline).encode('ascii')
        self._set_static(group, 'timeline', timeline, overwrite=True)

    def dcs_set_database_identifier(self, group, database_id):
        """Store the identifier once; returns False if one already exists."""
        database_id = database_id.encode('ascii')
        return self._set_static(group, 'database_identifier', database_id)

    def dcs_get_database_identifier(self, group):
        """Return the stored identifier of *group* as str, or None."""
        data = self._get_static(group, 'database_identifier')
        if data is not None:
            data = data.decode('ascii')
        return data

    def dcs_get_lock_owner(self, group, name):
        """Return the owner string stored in the lock node, or None."""
        path = self._path(group, 'lock', name)
        try:
            existing_data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return existing_data.decode('utf-8')

    def dcs_unlock(self, group, name, owner):
        """Delete the lock node if its stored owner equals *owner*."""
        existing_owner = self.dcs_get_lock_owner(group, name)
        if existing_owner == owner:
            path = self._path(group, 'lock', name)
            self._zk.delete(path)

    def dcs_lock(self, group, name, owner):
        """Try to take the ephemeral lock *name* of *group* for *owner*.

        :returns: ``'locked'`` when the lock node was created,
            ``'owned'`` when this session already holds it,
            ``'broken'`` when a lock carrying the same owner string but
            held by another session was deleted and re-taken, and
            ``'failed'`` when somebody else holds it.
        """
        data = owner.encode('utf-8')
        path = self._path(group, 'lock', name)
        try:
            self._zk.create(path, data, ephemeral=True, makepath=True)
            return 'locked'
        except kazoo.exceptions.NodeExistsError:
            pass
        # lock exists, do we have it, can we break it?
        try:
            existing_data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            # lock broke while we were looking at it
            # try get it again
            return self.dcs_lock(group, name, owner)
        if stat.owner_session_id == self._zk.client_id[0]:
            # we already own the lock
            return 'owned'
        elif data == existing_data:
            # it is our lock: perhaps I am restarting, or there are 2 of me
            # running!
            try:
                # version guard: only delete the node we actually looked at
                self._zk.delete(path, version=stat.version)
            except (kazoo.exceptions.NoNodeError, kazoo.exceptions.BadVersionError):
                # lock broke while we were looking at it
                pass
            # try get the lock again
            result = self.dcs_lock(group, name, owner)
            if result == 'locked':
                return 'broken'
            return result
        return 'failed'

    def dcs_watch_lock(self, name, group, callback):
        """Call *callback* with the decoded lock data whenever it changes.

        The kazoo callback is re-scheduled onto the asyncio loop via
        ``call_soon_threadsafe``; *callback* receives None when the watch
        reports no data.
        """
        loop = asyncio.get_event_loop()
        def handler(data, stat, event):
            if data is not None:
                data = data.decode('utf-8')
            callback(data)
        path = self._path(group, 'lock', name)
        w = self._zk.DataWatch(path, partial(loop.call_soon_threadsafe, handler))
        self._watchers[id(w)] = w

    def dcs_get_database_identifiers(self):
        """Return ``{group: identifier}`` for every stored identifier.

        NOTE(review): values are parsed with ``json.loads`` here while
        ``dcs_set_database_identifier`` stores the plain ASCII string --
        confirm which encoding is intended.
        """
        wanted_info_name = 'database_identifier'
        dirpath = self._folder_path('static')
        try:
            children = self._zk.get_children(dirpath)
        except kazoo.exceptions.NoNodeError:
            return {}
        result = {}
        for name in children:
            owner, info_name = name.split('-', 1)
            if wanted_info_name != info_name:
                continue
            try:
                data, state = self._zk.get(dirpath + '/' + name)
            except kazoo.exceptions.NoNodeError:
                continue
            state = json.loads(data.decode('ascii'))
            result[owner] = state
        return result

    def dcs_watch_database_identifiers(self, callback):
        """Watch ``static`` and report ``{cluster: identifier}`` mappings."""
        name = 'database_identifier'
        def handler(state, key, from_val, to_val):
            # this is probably more complex than it needs to be!
            c_state = _get_clusters(state)
            new_state = {}
            for k, v in c_state.items():
                ours = v.get(name, None)
                if ours is not None:
                    new_state[k] = ours
            callback(new_state)
        dirpath = self._folder_path('static')
        watch = DictWatch(
                self._zk,
                dirpath,
                handler,
                deserializer=lambda data: data.decode('utf-8'))
        self._watchers[id(watch)] = watch

    def dcs_watch_locks(self, name, callback):
        """Watch ``lock`` and report ``{cluster: owner}`` for lock *name*."""
        def handler(state, key, from_val, to_val):
            # this is probably more complex than it needs to be!
            c_state = _get_clusters(state)
            new_state = {}
            for k, v in c_state.items():
                ours = v.get(name, None)
                if ours is not None:
                    new_state[k] = ours
            callback(new_state)
        dirpath = self._folder_path('lock')
        watch = DictWatch(
                self._zk,
                dirpath,
                handler,
                deserializer=lambda data: data.decode('utf-8'))
        self._watchers[id(watch)] = watch

    def _set_info(self, group, type, owner, data):
        """Publish JSON-encoded *data* as an ephemeral node of this session.

        :returns: ``'existing'`` when a node owned by this session was
            updated, ``'create'`` when no node existed, ``'takeover'``
            when a node owned by another session was replaced.
        """
        path = self._path(group, type, owner)
        data = json.dumps(data)
        data = data.encode('ascii')
        try:
            stat = self._zk.set(path, data)
            how = 'existing'
        except kazoo.exceptions.NoNodeError:
            how = 'create'
            stat = None
        if stat is not None and stat.owner_session_id != self._zk.client_id[0]:
            # Node belongs to another session: replace it so that the
            # ephemeral node is tied to our own session.
            self._zk.delete(path)
            how = 'takeover'
            stat = None
        if stat is None:
            self._zk.create(path, data, ephemeral=True, makepath=True)
        return how

    def dcs_set_conn_info(self, group, owner, data):
        """Publish connection info; see ``_set_info`` for the result."""
        return self._set_info(group, 'conn', owner, data)

    def dcs_set_state(self, group, owner, data):
        """Publish state info; see ``_set_info`` for the result."""
        return self._set_info(group, 'state', owner, data)

    def _get_all_info(self, group, type):
        """Yield ``(owner, decoded_json)`` for each node in folder *type*.

        Nodes of other groups are skipped when *group* is not None. The
        nodes are ephemeral, so one may disappear between listing and
        reading; such nodes are skipped, as
        ``dcs_get_database_identifiers`` does.
        """
        dirpath = self._folder_path(type)
        try:
            children = self._zk.get_children(dirpath)
        except kazoo.exceptions.NoNodeError:
            return
        for name in children:
            this_group, owner = name.split('-', 1)
            if group is not None and this_group != group:
                continue
            try:
                data, state = self._zk.get(dirpath + '/' + name)
            except kazoo.exceptions.NoNodeError:
                # vanished between get_children() and get()
                continue
            state = json.loads(data.decode('ascii'))
            yield owner, state

    def dcs_list_conn_info(self, group=None):
        """Return a list of ``(owner, info)`` from the ``conn`` folder."""
        return list(self._get_all_info(group, 'conn'))

    def dcs_list_state(self, group=None):
        """Return a list of ``(owner, state)`` from the ``state`` folder."""
        return list(self._get_all_info(group, 'state'))

    def dcs_delete_conn_info(self, group, owner):
        """Delete the connection info node; a missing node is ignored."""
        path = self._path(group, 'conn', owner)
        try:
            self._zk.delete(path)
        except kazoo.exceptions.NoNodeError:
            pass
class AnalyticsDiscovery(gevent.Greenlet):
    """Greenlet that publishes one service instance in ZooKeeper and
    tracks the instances of other services.

    This instance's info is kept in an ephemeral znode
    ``<basepath>/<svc_name>/<inst>``. For every key of ``watchers`` the
    children of ``<basepath>/<key>`` are tracked and the associated
    callback (when truthy) is called with the sorted list of the
    children's decoded data.
    """

    def _sandesh_connection_info_update(self, status, message):
        """Report the ZooKeeper connection status and remember it.

        :param status: attribute name on ``ConnectionStatus``
            (this class uses 'INIT', 'UP' and 'DOWN').
        :param message: free text forwarded to ``ConnectionState.update``.
        """
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name,
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        # Log only on transitions, not on every repeated report.
        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Kazoo connection-state listener.

        A transition to CONNECTED marks the connection UP and requests a
        re-publish through ``_reconnect``; any other state marks it DOWN.
        """
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            if self._conn_state != ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='UP', message='')
                self._logger.error("Analytics Discovery to publish %s" %
                                   str(self._pubinfo))
                self._reconnect = True
            else:
                self._logger.error("Analytics Discovery already connected")
        else:
            self._logger.error("Analytics Discovery NOT connected")
            if self._conn_state == ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='DOWN', message='')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        """DataWatch callback for one child of a watched service.

        Stores the decoded data of *child* (or forgets the child when
        *data* is empty) and notifies the watcher callback.
        """
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._watchers[watcher](sorted(self._wchildren[watcher].values()))

    def _zk_watcher(self, watcher, children):
        """ChildrenWatch callback: request a full resync in ``_run``."""
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self,
                 logger,
                 zkservers,
                 svc_name,
                 inst,
                 watchers=None,
                 zpostfix="",
                 freq=10):
        """
        :param logger: logger to use; when None the ``logging`` module
            itself is used after ``logging.basicConfig()``.
        :param zkservers: comma separated ZooKeeper hosts.
        :param svc_name: name of the published service.
        :param inst: instance name, used as the znode name.
        :param watchers: mapping of watched service name to callback;
            defaults to a fresh empty dict per instance.
        :param zpostfix: suffix of the ``/analytics-discovery-`` base path.
        :param freq: seconds slept between iterations of the run loop.
        """
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zk = KazooClient(hosts=zkservers)
        self._pubinfo = None
        # A mutable default argument would be shared by every instance.
        self._watchers = {} if watchers is None else watchers
        self._wchildren = {}
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        """Publish *pubinfo* for this instance, or withdraw it when None.

        The value is always remembered so that it can be re-published
        after a reconnect. Nothing is written while the connection is not
        UP. Any failure marks the connection DOWN and requests a
        reconnect.
        """
        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                svc_path = self._basepath + "/" + self._svc_name
                node_path = "%s/%s/%s" % \
                        (self._basepath, self._svc_name, self._inst)
                self._logger.error("ensure %s" % (svc_path))
                self._logger.error("zk state %s (%s)" %
                                   (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(svc_path)
                self._logger.error("check for %s" % node_path)
                if pubinfo is not None:
                    if self._zk.exists(node_path):
                        self._zk.set(node_path, self._pubinfo)
                    else:
                        self._zk.create(node_path, self._pubinfo,
                                        ephemeral=True)
                else:
                    if self._zk.exists(node_path):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete(node_path)

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")

    def _run(self):
        """Greenlet body: connect, install watches, then resync forever.

        The loop publishes and re-reads all watched children whenever
        ``_reconnect`` is set, sleeping ``freq`` seconds in between. It
        ends on ``gevent.GreenletExit``; an error outside the resync loop
        raises ``SystemExit``.
        """
        while True:
            try:
                self._zk.start()
                break
            except (gevent.event.Timeout, Exception) as e:
                # Zookeeper is also throwing exception due to delay in
                # master election, so every failure is retried.
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            self._zk.add_listener(self._zk_listen)
            for wk in self._watchers:
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self.publish(self._pubinfo)

                        for wk in self._watchers:
                            watch_path = self._basepath + "/" + wk
                            self._zk.ensure_path(watch_path)
                            children = self._zk.get_children(watch_path)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                child_path = watch_path + "/" + elem
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(
                                        child_path,
                                        partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(child_path)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(
                                    self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.stop()
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    # retry the whole resync on the next iteration
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
Exemple #12
0
class ZooKeeper(object):
    """Cached view of a Solr cluster layout read from ZooKeeper.

    Watches installed in ``__init__`` keep ``collections``, ``liveNodes``
    and ``aliases`` up to date; the ``get*`` methods answer from those
    cached dictionaries.
    """

    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = "/live_nodes"
    ALIASES = "/aliases.json"
    CLUSTER_STATE = "/clusterstate.json"
    COLLECTION_STATUS = "/collections"
    COLLECTION_STATE = "/collections/%s/state.json"
    SHARDS = "shards"
    REPLICAS = "replicas"
    STATE = "state"
    ACTIVE = "active"
    LEADER = "leader"
    BASE_URL = "base_url"
    TRUE = "true"
    FALSE = "false"
    COLLECTION = "collection"

    def __init__(self,
                 zkServerAddress,
                 timeout=15,
                 max_retries=-1,
                 kazoo_client=None):
        """Start the client and install all watches.

        :param zkServerAddress: hosts string for a new ``KazooClient``
            (unused when *kazoo_client* is given).
        :param timeout: timeout passed to the new client.
        :param max_retries: ``max_tries`` of both retry policies.
        :param kazoo_client: ready-made client to use instead.
        :raises RuntimeError: when ``KazooClient`` is None.
        """
        if KazooClient is None:
            logging.error(
                "ZooKeeper requires the `kazoo` library to be installed")
            raise RuntimeError

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is not None:
            self.zk = kazoo_client
        else:
            self.zk = KazooClient(
                zkServerAddress,
                read_only=True,
                timeout=timeout,
                command_retry={"max_tries": max_retries},
                connection_retry={"max_tries": max_retries},
            )

        self.zk.start()

        def connectionListener(state):
            # Only degraded states are recorded.
            if state == KazooState.LOST or state == KazooState.SUSPENDED:
                self.state = state

        self.zk.add_listener(connectionListener)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if data:
                self.collections = json.loads(data.decode("utf-8"))
                logger.info("Updated collections: %s", self.collections)
                return
            logger.warning(
                "No cluster state available: no collections defined?")

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            logger.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if not data:
                self.aliases = None
            else:
                parsed = json.loads(data.decode("utf-8"))
                if ZooKeeper.COLLECTION not in parsed:
                    logger.warning(
                        "Expected to find %s in alias update %s",
                        ZooKeeper.COLLECTION,
                        parsed.keys(),
                    )
                else:
                    self.aliases = parsed[ZooKeeper.COLLECTION]
            logger.info("Updated aliases: %s", self.aliases)

        def watchCollectionState(data, *args, **kwargs):
            if data:
                # Per-collection state is merged into the cached mapping.
                self.collections.update(json.loads(data.decode("utf-8")))
                logger.info("Updated collections: %s", self.collections)
                return
            logger.warning(
                "No cluster state available: no collections defined?")

        @self.zk.ChildrenWatch(ZooKeeper.COLLECTION_STATUS)
        def watchCollectionStatus(children):
            logger.info("Updated collection: %s", children)
            for child in children:
                self.zk.DataWatch(self.COLLECTION_STATE % child,
                                  watchCollectionState)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """Return the base URLs of active replicas in active shards.

        Alias names are resolved through ``getAliasHosts``. URLs are
        returned without duplicates, in discovery order.

        :param only_leader: keep only replicas whose ``leader`` is "true".
        :param seen_aliases: aliases already visited (cycle guard).
        :raises SolrError: when *collname* is not a known collection.
        """
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)

        found = []
        all_shards = self.collections[collname][ZooKeeper.SHARDS]
        for shard in all_shards.values():
            if shard[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                continue
            for replica in shard[ZooKeeper.REPLICAS].values():
                if replica[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                    continue
                if only_leader and (
                        replica.get(ZooKeeper.LEADER) != ZooKeeper.TRUE):
                    continue
                url = replica[ZooKeeper.BASE_URL]
                if url not in found:
                    found.append(url)
        return found

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        """Return the hosts of every collection behind alias *collname*.

        An alias that was already visited yields an empty list and a
        warning, which breaks circular definitions.
        """
        if not seen_aliases:
            seen_aliases = []
        elif collname in seen_aliases:
            logger.warning("%s in circular alias definition - ignored",
                           collname)
            return []
        seen_aliases.append(collname)

        merged = []
        for target in self.aliases[collname].split(","):
            for url in self.getHosts(target, only_leader, seen_aliases):
                if url not in merged:
                    merged.append(url)
        return merged

    def getRandomURL(self, collname, only_leader=False):
        """Return ``<random host>/<collname>``.

        :raises SolrError: when no host is available.
        """
        candidates = self.getHosts(collname, only_leader=only_leader)
        if not candidates:
            raise SolrError("ZooKeeper returned no active shards!")
        chosen = random.choice(candidates)  # NOQA: B311
        return "%s/%s" % (chosen, collname)

    def getLeaderURL(self, collname):
        """Return ``getRandomURL`` restricted to leader replicas."""
        return self.getRandomURL(collname, only_leader=True)
Exemple #13
0
class watcher:
    """Console scoreboard fed from pickled lists stored in ZooKeeper.

    Reads ``/gameData/recentScores``, ``/gameData/maxScores`` and
    ``/gameData/activeUsers`` and prints the score tables whenever one of
    the two watched nodes changes. Entries of users present in the
    active-users list are marked with `` **``.
    """

    def __init__(self, hostName, displayListSize):
        """Connect to ZooKeeper and install the data watches.

        :param hostName: hosts string passed to ``KazooClient``.
        :param displayListSize: maximum number of rows per table.
        """
        self.host = hostName
        self.maxDisplaySize = displayListSize
        self.zk = KazooClient(hosts=self.host)
        self.zk.start()
        self.zk.DataWatch("/gameData/recentScores", self.displayScores)
        self.zk.DataWatch("/gameData/activeUsers", self.displayScores)
        atexit.register(self.cleanup)

    def displayScores(self, var1, var2):
        """DataWatch callback; both arguments are ignored."""
        self.displayRecentScores()
        self.displayHighestScores()

    def _loadPickledList(self, path):
        """Return the unpickled value at *path*, creating an empty list
        node first when the node does not exist."""
        self.zk.ensure_path(path="/gameData")
        if not self.zk.exists(path=path):
            self.zk.create(path=path, value=pickle.dumps([]))
        rawValue, _ = self.zk.get(path=path)
        # NOTE(security): pickle.loads executes arbitrary code if anyone
        # who can write to this znode is untrusted.
        return pickle.loads(rawValue)

    def _printScoreTable(self, title, scores, rowFormat):
        """Print *title*, a separator and one row per score entry."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(title)
        print("------------------")
        activeUsersList = self.activeUserList()
        for entry in scores:
            outputStr = rowFormat % (entry[0], entry[1])
            if entry[0] in activeUsersList:
                outputStr += " **"
            print(outputStr)

    def activeUserList(self):
        """Return the list stored at ``/gameData/activeUsers``."""
        return self._loadPickledList("/gameData/activeUsers")

    def displayRecentScores(self):
        """Print the last ``maxDisplaySize`` entries of the recent scores."""
        recentScores = self._loadPickledList("/gameData/recentScores")
        if len(recentScores) > self.maxDisplaySize:
            recentScores = recentScores[len(recentScores) -
                                        self.maxDisplaySize:]
        self._printScoreTable("\nMost recent scores", recentScores,
                              "%s \t\t %d")

    def displayHighestScores(self):
        """Print the first ``maxDisplaySize`` entries of the max scores."""
        highestScores = self._loadPickledList("/gameData/maxScores")
        if len(highestScores) > self.maxDisplaySize:
            highestScores = highestScores[0:self.maxDisplaySize]
        self._printScoreTable("\nHighest scores", highestScores,
                              "%s \t %d")

    def cleanup(self):
        """Stop the ZooKeeper client (registered with ``atexit``)."""
        self.zk.stop()
Exemple #14
0
class Server(threading.Thread):
    '''
    Worker server (which is also a ZooKeeper client).

    Each server takes a shared (read, "R") or exclusive (write, "W") lock
    by creating an ephemeral sequential node under the lock root and
    waiting for the relevant predecessor node to be deleted.
    '''
    # Lock that serialises console output. This is a local, in-process
    # lock; the distributed lock is what the class itself implements.
    print_mutex = threading.Lock()

    # Seconds the simulated work takes.
    DELAY_TIME = 3

    def __init__(self, zk_server_address, lock_base_path, host, serve_mode):
        '''
        :param zk_server_address: hosts string for ``KazooClient``.
        :param lock_base_path: path of the lock root node.
        :param host: host IP, used as prefix of the lock node name.
        :param serve_mode: working mode, "W" for write, otherwise read.
        '''
        threading.Thread.__init__(self)
        # Path of the lock root node
        self.lock_base_path = lock_base_path
        # Host IP
        self.host = host
        # Working mode, read/write
        self.serve_mode = serve_mode
        # Event that is set once the awaited predecessor node is gone
        self.event = threading.Event()

        # Create a zookeeper client
        self.zkclient = KazooClient(zk_server_address)
        # Register the connection state listener
        self.zkclient.add_listener(self.zk_connect_listener)
        # Open the connection to zookeeper
        self.zkclient.start()

    # Connection state listener
    def zk_connect_listener(self, state):
        '''Print a message for CONNECTED / LOST; raise for other states.'''
        # ``with`` releases the print lock even when the branch below
        # raises; a manual acquire/release pair would leave it held.
        with Server.print_mutex:
            if state == KeeperState.CONNECTED:
                print(self.host + " 已经开启...")
            elif state == KazooState.LOST:
                print(self.host + " 停止服务...")
            else:
                raise Exception(self.host + " 未正常开启...")

    # Thread body
    def run(self):
        '''Create the lock node, take the lock, work, release, stop.'''
        try:
            # Create the lock node, e.g. /shared_lock/192.168.0.0-R-0000000001
            self.create_lock_node()
            # Acquire the lock
            self.acquire_lock()
            # Do the work
            self.work()
            # Release the lock
            self.release_lock()
        finally:
            # Always close the session, even when a step above failed;
            # the lock node is ephemeral and disappears with the session.
            self.stop()

    def create_lock_node(self):
        '''Create this server's ephemeral sequential lock node.'''
        # ensure_path tolerates the root already existing, so several
        # servers starting at once do not race between exists() and
        # create().
        self.zkclient.ensure_path(self.lock_base_path)
        # Build the path prefix of this server's child node
        node_path = self.lock_base_path + "/" + self.host + "-" + self.serve_mode + "-"
        # Create an ephemeral sequential node (bytes value works on
        # Python 2 and 3)
        self.node_path = self.zkclient.create(
            node_path, b"", self.zkclient.default_acl,
            ephemeral=True, sequence=True)

    # Reaction to changes of the watched predecessor node
    def pre_node_delete_watch(self, data, stat, event):
        '''Set ``self.event`` once the watched predecessor no longer exists.'''
        # A DataWatch calls back immediately on registration; if the
        # predecessor was already deleted by then there is no stat and no
        # event, and waiting for a DELETED event would block forever.
        if stat is None or (event and event.type == EventType.DELETED):
            self.event.set()

    # Acquire the lock
    def acquire_lock(self):
        '''Block until this server may proceed.

        A writer waits for the node directly in front of it; a reader
        waits for the last writer in front of it. With nothing to wait
        for the method returns immediately.
        '''
        # Extract our own node name
        node_name = self.node_path.split("/")[-1]
        # Children of the lock root, sorted by sequence number
        sorted_children = self.get_sorted_children()
        # Our position in that order
        node_index = sorted_children.index(node_name)

        if self.serve_mode == "W":
            # A writer needs every earlier node to be gone; it waits on
            # its direct predecessor (index -1 means "none").
            wait_index = node_index - 1
        else:
            # A reader only has to wait for the last earlier writer.
            wait_index = -1
            for i in reversed(range(node_index)):
                # Node names are <host>-<mode>-<sequence>; taking the mode
                # from the end keeps working for hosts containing "-".
                if sorted_children[i].split("-")[-2] == "W":
                    wait_index = i
                    break

        if wait_index < 0:
            # Nothing in front of us blocks: the lock is ours right away.
            return

        pre_node_path = self.lock_base_path + "/" + sorted_children[wait_index]
        # Watch the predecessor for deletion
        self.zkclient.DataWatch(pre_node_path, self.pre_node_delete_watch)
        # Wait for the lock
        self.event.wait()

    def work(self):
        '''Print what is being done, then sleep ``DELAY_TIME`` seconds.'''
        with Server.print_mutex:
            if self.serve_mode == "W":
                print(self.host + " 正在写数据...")
            else:
                print(self.host + " 正在读数据...")
        # Pause for a few seconds to simulate the work taking time
        sleep(self.DELAY_TIME)

    # Release the lock
    def release_lock(self):
        '''Delete this server's own lock node.'''
        self.zkclient.delete(self.node_path)

    # Sorted list of the lock root's children
    def get_sorted_children(self):
        '''Return the lock root's children ordered by sequence number.'''
        children = self.zkclient.get_children(self.lock_base_path)
        # The sequence number is the last "-"-separated part of the name.
        # A key function works on Python 2 and 3 (cmp= and string.atoi
        # are Python 2 only) and yields the same stable order.
        children.sort(key=lambda node_name: int(node_name.split("-")[-1]))
        return children

    # Stop working
    def stop(self):
        '''Stop and close the ZooKeeper client.'''
        # NOTE(review): this callback was only ever used as a DataWatch
        # callback and never registered with add_listener, so this call
        # presumably has no effect; kept as-is to preserve behaviour.
        self.zkclient.remove_listener(self.pre_node_delete_watch)
        # End the session
        self.zkclient.stop()
        self.zkclient.close()
    # NOTE(review): the lines below are the body of a function whose `def`
    # line is not visible here; `state_listener`, `watch_node`,
    # `visualize_z_tree` and `kill_app` are defined elsewhere.

    # Argument parser setup: one positional argument naming the
    # application to run.
    parser = argparse.ArgumentParser(description='Script description TODO')
    parser.add_argument('application', metavar='"app"', help='application to run after creating znode /z')
    application_to_run = parser.parse_args().application
    print("Aplication to run when /z node exists is \"" + application_to_run + "\"")

    # Initial state; presumably updated by the /z watcher -- confirm
    # against the callbacks.
    z_already_exists = False
    application_process = None
    # Three ZooKeeper servers on localhost, joined into one
    # comma-separated hosts string for KazooClient.
    servers_addresses = ['127.0.0.1:2181', '127.0.0.1:2182', '127.0.0.1:2183']
    hosts = ','.join(servers_addresses)

    # Start the kazoo client, then add the state listener and the /z node
    # watcher.
    zk = KazooClient(hosts=hosts)
    zk.start()
    zk.add_listener(state_listener)
    zk.DataWatch("/z", watch_node)

    # Handle user commands until 'quit' is entered.
    while True:
        command = input('Type the command ( tree | quit )\n')
        if command == 'tree':
            visualize_z_tree()
        elif command == 'quit':
            # Calls kill_app() only when /z is flagged as existing.
            if z_already_exists:
                kill_app()
            break
        else:
            print('Incorrect command')

    # Stop the ZooKeeper client before leaving.
    zk.stop()
    
Exemple #16
0
class ZkClient:

    def __init__(self, zk_servers, app):
        """Connect to ZooKeeper and register the data watches.

        Args:
            zk_servers: host string passed to ``KazooClient(hosts=...)``.
            app: stored as ``self.app``; not otherwise used here.
        """
        self.app = app

        client = KazooClient(hosts=zk_servers)
        self.zk = client
        client.start()

        # Node paths and cached state must be set before the watches are
        # registered, because the callbacks read them from ``self``.
        self.server_node_path = "/entry/service"
        self.node = "/entry/serviceinfo/node/loan_mng"
        self.os_center_node = "/entry/service/os_center/node"
        self.loan_mng_hosts = ""

        # (watched path, callback) pairs, registered in this order.
        watches = (
            (self.node, self.get_loan_mng_hosts),
            (self.os_center_node, self.get_os_center_hosts),
            (self.server_node_path, self.get_servers_node),
        )
        for watched_path, callback in watches:
            client.DataWatch(watched_path, callback)
        
    def get_loan_mng_hosts(self, *args):
        try:
            data = json.loads(self.zk.get(self.node)[0])
            ip = data["node_list"][0]["ip"]
            port = data["node_list"][0]["port"]
            host = "http://{}:{}".format(ip, port)
            logger.info("ZK | GET LOAN_MNG HOSTS | SUCCESS | HOST: {}".format(host))
            self.loan_mng_hosts = host
            return host
        except Exception as e:
            logger.info("ZK | GET LOAN_MNG HOSTS | FAILED | ERROR: {}".format(str(e)))
            self.loan_mng_hosts = ""

    def get_os_center_hosts(self, *args):
        try:
            children = self.zk.get_children(self.os_center_node)
            node = children[0]
            data = self.zk.get(self.os_center_node+"/"+node)[0].decode()
            host = "http://{}".format(data)
            logger.info("ZK | GET OS_CENTER HOSTS | SUCCESS | HOST: {}".format(host))
            return host
        except Exception as e:
            logger.info("ZK | GET OS_CENTER HOSTS | FAILED | ERROR: {}".format(str(e)))
            return ""

    def get_config(self, category):
        """Load the JSON configuration stored for *category*.

        The node path is looked up in ``ConfigNameMap.zk_path``. If the node
        does not exist it is first created holding an empty JSON object.

        Args:
            category: key into ``ConfigNameMap.zk_path``.

        Returns:
            The decoded JSON data, or ``{}`` if reading or parsing fails (the
            failure is logged).

        Raises:
            Whatever the path lookup, ``exists`` or ``create`` raise; those
            calls run outside the guarded section.
        """
        path = ConfigNameMap.zk_path[category]

        node_missing = not self.zk.exists(path)
        if node_missing:
            empty_payload = json.dumps({}).encode()
            self.zk.create(path, empty_payload)

        try:
            raw = self.zk.get(path)[0]
            return json.loads(raw.decode())
        except Exception as err:
            logger.info("ZK | GET CONFIG | FAILED | CATEGORY: {}| ERROR: {}".format(category, str(err)))
            return {}

    def write_config(self, category, config):
        """Serialize *config* as JSON and store it on the node for *category*.

        Args:
            category: key into ``ConfigNameMap.zk_path``.
            config: JSON-serializable object to store.

        Returns:
            ``True`` when the data was written; ``False`` (after logging) if
            creating the path, serializing, or writing fails.

        Raises:
            Whatever the ``ConfigNameMap.zk_path`` lookup raises; it runs
            outside the guarded section.
        """
        path = ConfigNameMap.zk_path[category]
        try:
            # ensure_path() creates the node and any missing parents, so no
            # extra exists()/create() round trip is needed before set().
            self.zk.ensure_path(path)
            self.zk.set(path, json.dumps(config).encode())
            return True
        except Exception as e:
            logger.info("ZK | SYNC CONFIG | FAILED | CATEGORY: {}| ERROR: {}".format(category, str(e)))
            return False

    def get_servers_node(self, *args):
        """
        获取所有服务的注册节点
        """
        servers_node = []

        def _get_childern(path):
            try:
                reg = self.zk.get_children(path)
                return reg
            except Exception as e:
                return []

        def _get_data(path):
            try:
                data = self.zk.get(path)[0].decode()
                return data
            except Exception as e:
                return None

        try:
            all_server = self.zk.get_children(self.server_node_path)
            for server_name in all_server:
                path = "{}/{}/node".format(self.server_node_path, server_name)
                registration = _get_childern(path)
                data = []
                for i in registration:
                    node_data = _get_data(path+"/"+i)
                    data.append(node_data)
                servers_node.append({
                    "name": server_name,
                    "node": list(set(data))
                })
            return servers_node
        except NoNodeError as e:
            logger.warn("NO NODE ERROR | NODE PATH {}".format(self.server_node_path))
            return []