Example #1
class Exhibitor:

    def __init__(self, exhibitor, chroot):
        self.chroot = chroot
        self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=30)
        self.client = KazooClient(hosts=self.exhibitor.zookeeper_hosts + self.chroot,
                                  command_retry={
                                      'deadline': 10,
                                      'max_delay': 1,
                                      'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.client.start()

    def session_listener(self, state):
        pass

    def _poll_exhibitor(self):
        if self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts + self.chroot)

    def get(self, *params):
        self._poll_exhibitor()
        return self.client.retry(self.client.get, *params)

    def get_children(self, *params):
        self._poll_exhibitor()
        try:
            return self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []
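The wrapper above re-resolves the ensemble from Exhibitor before every read and funnels each operation through client.retry. A minimal wiring sketch, assuming a hypothetical Exhibitor endpoint (the hosts, port, and chroot values here are placeholders, not from the original project):

exhibitor = Exhibitor({'hosts': ['exhibitor1', 'exhibitor2'], 'port': 8181},
                      chroot='/service/demo')
data, stat = exhibitor.get('/config')      # retried kazoo get -> (bytes, ZnodeStat)
children = exhibitor.get_children('/')     # retried; [] if the node is missing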
Example #2
def _zk_set_flag(zk: KazooClient, ephemeral: bool = False) -> str:
    """
    Store the `FLAG` value in ZooKeeper in a random Znode.
    """
    znode = '/{}'.format(uuid.uuid4())
    zk.retry(zk.create, znode, makepath=True, ephemeral=ephemeral)
    zk.retry(zk.set, znode, FLAG)
    return znode
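A minimal usage sketch for the helper above, assuming a reachable ensemble and a bytes FLAG constant (both are assumptions, not part of the original module):

import uuid
from kazoo.client import KazooClient

FLAG = b'flag'  # assumption: the module-level constant is a bytes value

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
znode = _zk_set_flag(zk, ephemeral=True)
data, _ = zk.get(znode)
assert data == FLAG
zk.stop()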
Example #3
class ZooKeeper:
    """Helper ZooKeeper function that handles connection and node updates"""
    @Retry(exception_list=[
        ConnectionLoss, SessionExpiredError, KazooTimeoutError
    ])
    def __init__(self) -> None:
        hosts = settings.ZOO_HOSTS
        retry = KazooRetry(max_tries=-1, max_delay=60)
        self._zk = KazooClient(hosts,
                               connection_retry=retry,
                               command_retry=retry)

        # establish the connection
        self._zk.start()

    def _set_node(self,
                  path: str,
                  value: Optional[Any] = None,
                  ephemeral: bool = False) -> None:
        try:
            self._zk.retry(self._zk.set,
                           path=path,
                           value=pickle.dumps(value) or None)
        except NoNodeError:
            self._create_node(path, value, ephemeral)

    def _get_node(self, path: str) -> Any:
        # NoNodeError needs to be handled differently, so we don't handle it here
        value, *_ = self._zk.retry(self._zk.get, path=path, watch=False)
        return pickle.loads(value)

    def _get_children(self, path: str) -> Any:
        # NoNodeError needs to be handled differently, so we don't handle it here
        value = self._zk.retry(self._zk.get_children, path=path)
        return value

    def _delete_node(self, path: str, recursive: bool = True) -> bool:
        try:
            self._zk.retry(self._zk.delete, path=path, recursive=recursive)
            return True
        except NotEmptyError:
            return False

    def _create_node(self,
                     path: str,
                     value: Optional[Any] = None,
                     ephemeral: bool = False) -> bool:
        try:
            self._zk.retry(self._zk.create,
                           path=path,
                           ephemeral=ephemeral,
                           value=pickle.dumps(value) or None,
                           makepath=True)
            return True
        except NodeExistsError:
            return False
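Because values pass through pickle in both directions, arbitrary Python objects round-trip through these helpers. A sketch of the intended flow, assuming settings.ZOO_HOSTS points at a reachable ensemble:

zk = ZooKeeper()
zk._create_node('/demo/config', {'threshold': 5})    # pickled before storage
assert zk._get_node('/demo/config') == {'threshold': 5}
zk._set_node('/demo/config', {'threshold': 7})       # falls back to create on NoNodeError
zk._delete_node('/demo', recursive=True)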
Example #4
class Root(Record):
    def __init__(self, root_path):
        if not root_path.startswith("/"):
            root_path = "/" + root_path

        super(Root, self).__init__(None, root_path)

        self.lock = Lock()
        self.ZK_retry = KazooRetry(max_tries=-1)
        self.ZK = None

    def connect(self, zookeeper_hosts):
        self.ZK = KazooClient(
            zookeeper_hosts,
            connection_retry=self.ZK_retry,
            command_retry=self.ZK_retry)

        self.ZK.start()

        # create & load collections
        self.clusters = Collection(self, "clusters", Cluster)
        self.meta = Meta(self, "meta")

        return self.load()

    def load(self):
        super(Root, self).load()
        self.clusters.load()
        self.meta.load()
        return self

    def zk_ensure_path(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.ensure_path, *args, **kwargs)

    def zk_set(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.set, *args, **kwargs)

    def zk_get(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.get, *args, **kwargs)

    def zk_get_children(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.get_children, *args, **kwargs)

    def zk_delete(self, *args, **kwargs):
        return self.ZK.retry(self.ZK.delete, *args, **kwargs)
Example #5
def _zk_flag_exists(zk: KazooClient, znode: str) -> bool:
    """
    The `FLAG` value exists in ZooKeeper at `znode` path.
    """
    try:
        value = zk.retry(zk.get, znode)
    except NoNodeError:
        return False
    return bool(value[0] == FLAG)
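This is the read-side counterpart of _zk_set_flag from Example #2: zk.get returns a (data, stat) tuple, so value[0] is the raw bytes payload. A paired sketch under the same assumptions (started client, bytes FLAG constant, uuid imported as in Example #2):

znode = _zk_set_flag(zk)                # from Example #2
assert _zk_flag_exists(zk, znode)
assert not _zk_flag_exists(zk, '/{}'.format(uuid.uuid4()))   # missing node -> False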
Example #6
class TestSessions(unittest.TestCase):
    def setUp(self):
        from kazoo.client import KazooClient
        from kazoo.protocol.states import KazooState
        from kazoo.testing.common import ZookeeperCluster
        ZK_HOME = os.environ.get("ZOOKEEPER_PATH")
        ZK_CLASSPATH = os.environ.get("ZOOKEEPER_CLASSPATH")
        self.cluster = ZookeeperCluster(ZK_HOME,
                                        size=1,
                                        port_offset=21000,
                                        classpath=ZK_CLASSPATH)
        self.cluster.start()
        atexit.register(lambda cluster: self.cluster.terminate(), self.cluster)
        self.client = KazooClient(self.cluster[0].address, max_retries=5)
        self.ev = threading.Event()

        def back(state):
            if state == KazooState.CONNECTED:
                self.ev.set()

        self.client.start()
        self.path = self.client.create("/" + uuid.uuid4().hex)
        self.client.add_listener(back)

    def test_restarted_server(self):
        raise SkipTest('Patch missing')
        self.cluster.stop()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertTrue(self.client.retry(self.client.exists, self.path))

    def test_terminated_server(self):
        raise SkipTest('Patch missing')
        self.cluster.reset()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertFalse(self.client.retry(self.client.exists, self.path))

    def tearDown(self):
        self.ev.clear()
        self.client.stop()
        self.cluster.stop()
Example #7
class TestSessions(unittest.TestCase):

    def setUp(self):
        from kazoo.client import KazooClient
        from kazoo.protocol.states import KazooState
        from kazoo.testing.common import ZookeeperCluster
        ZK_HOME = os.environ.get("ZOOKEEPER_PATH")
        ZK_CLASSPATH = os.environ.get("ZOOKEEPER_CLASSPATH")
        self.cluster = ZookeeperCluster(ZK_HOME, size=1, port_offset=21000, classpath=ZK_CLASSPATH)
        self.cluster.start()
        atexit.register(lambda cluster: self.cluster.terminate(), self.cluster)
        self.client = KazooClient(self.cluster[0].address, max_retries=5)
        self.ev = threading.Event()

        def back(state):
            if state == KazooState.CONNECTED:
                self.ev.set()
        self.client.start()
        self.path = self.client.create("/" + uuid.uuid4().hex)
        self.client.add_listener(back)

    def test_restarted_server(self):
        raise SkipTest('Patch missing')
        self.cluster.stop()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertTrue(self.client.retry(self.client.exists, self.path))

    def test_terminated_server(self):
        raise SkipTest('Patch missing')
        self.cluster.reset()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertFalse(self.client.retry(self.client.exists, self.path))

    def tearDown(self):
        self.ev.clear()
        self.client.stop()
        self.cluster.stop()
Example #8
class ZkCacheTaskManager(object):

    RETRIES = 2

    def __init__(self,
                 host='127.0.0.1:2181',
                 lock_path_prefix='/mastermind/cache/'):
        self.client = KazooClient(host, timeout=3)
        logger.info(
            'Connecting to zookeeper host {}, lock_path_prefix: {}'.format(
                host, lock_path_prefix))
        try:
            self.client.start()
        except Exception as e:
            logger.error(e)
            raise

        self.lock_path_prefix = lock_path_prefix

    def put_task(self, task):
        group_id = task['group']
        q = LockingQueue(self.client, self.lock_path_prefix, group_id)
        return q.put(self._serialize(task))

    def put_all(self, tasks):
        for task in tasks:
            self.put_task(task)

    def list(self):
        for group_id in self.client.retry(self.client.get_children,
                                          self.lock_path_prefix):
            for item in LockingQueue(self.client, self.lock_path_prefix,
                                     group_id).list():
                yield self._unserialize(item)

    @staticmethod
    def _serialize(task):
        return msgpack.packb(task)

    @staticmethod
    def _unserialize(task):
        return msgpack.unpackb(task)
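Tasks are serialized with msgpack on the way into the queue and unpacked on the way out. A round-trip sketch (the task dict is hypothetical; with msgpack >= 1.0, string keys survive the round trip as str):

task = {'group': 42, 'action': 'warm_cache'}
packed = ZkCacheTaskManager._serialize(task)        # -> bytes
assert ZkCacheTaskManager._unserialize(packed) == task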
Example #9
class ZkCacheTaskManager(object):

    RETRIES = 2

    def __init__(self, host='127.0.0.1:2181', lock_path_prefix='/mastermind/cache/'):
        self.client = KazooClient(host, timeout=3)
        logger.info('Connecting to zookeeper host {}, lock_path_prefix: {}'.format(
            host, lock_path_prefix))
        try:
            self.client.start()
        except Exception as e:
            logger.error(e)
            raise

        self.lock_path_prefix = lock_path_prefix

    def put_task(self, task):
        group_id = task['group']
        q = LockingQueue(self.client, self.lock_path_prefix, group_id)
        return q.put(self._serialize(task))

    def put_all(self, tasks):
        for task in tasks:
            self.put_task(task)

    def list(self):
        for group_id in self.client.retry(self.client.get_children, self.lock_path_prefix):
            for item in LockingQueue(self.client, self.lock_path_prefix, group_id).list():
                yield self._unserialize(item)

    @staticmethod
    def _serialize(task):
        return msgpack.packb(task)

    @staticmethod
    def _unserialize(task):
        return msgpack.unpackb(task)
Example #10
def main(args):
    zk = KazooClient(hosts=zk_host)
    zk.start()

    register_emergency_signal_handler(zk)

    try:
        zk.retry(zk.create, test_znode, makepath=True)
    except NodeExistsError:
        logging.info('{} already exists, no need to create'.format(test_znode))

    start_command = 'start({},{})'.format(args.message_size, args.topics)
    zk.retry(lambda: zk.set(test_znode, start_command))

    t_start = time.time() # in seconds
    t_end = t_start + args.duration

    while time.time() < t_end:
        time.sleep(t_end - time.time()) # shouldn't introduce error larger than 10-15 ms

    zk.retry(lambda: zk.set(test_znode, 'stop'))
    zk.stop()
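The two retry invocation styles used above are equivalent: zk.retry(func, *args) forwards the arguments itself, while the lambda form wraps the call in a zero-argument callable. A short sketch (test_znode is assumed to exist; kazoo's set expects bytes):

zk.retry(zk.set, test_znode, b'stop')            # retry forwards the args to zk.set
zk.retry(lambda: zk.set(test_znode, b'stop'))    # equivalent zero-arg callable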
Example #11
class ServiceRegister(object):
    def __init__(self, hosts="127.0.0.1:2181", read_only=True, logger=None):
        """
        Service registration.
        :param hosts: list of addresses of the ZooKeeper ensemble
        :param read_only: whether the connection is read-only
        :param logger: logger object
        """
        if not logger:
            import logging
            logging.basicConfig()
        self._zk = KazooClient(hosts, read_only=read_only, logger=logger)
        self._zk.start()

    def restart(self):
        self._zk.restart()

    def retry_get(self, path, watcher=None):
        """
        Read with retry.
        :param path: node path
        :param watcher: watcher callback
        :return: on success: node value and version; on failure: error message and error code.
        """
        return self._zk.retry(self.get, path, watcher)

    def lock(self, path, identifier, timeout=None):
        """
        Distributed lock.
        :param path: lock path
        :param identifier: lock identifier
        :param timeout: timeout in seconds
        :return: lock object
        """
        return DLock(self._zk, path, identifier, timeout)

    def exist(self, path):
        """
        Check whether a node exists.
        :param path: node path
        :return: True if the node exists, False otherwise.
        """
        state = self._zk.exists(path)
        return state is not None

    def create(self, path, value=""):
        """
        Create a node.
        :param path: node path
        :param value: node value
        :return: node path
        """
        try:
            res_path = self._zk.create(path, value, makepath=True)
        except NodeExistsError:
            return path
        except NoNodeError as e:
            return str(e)
        except ZookeeperError as e:
            return str(e)
        else:
            return res_path

    def get(self, path, watcher=None):
        """
        Read a node value.
        :param path: node path
        :param watcher: watcher callback
        :return: on success: node value and version; on failure: error message and error code.
        """
        try:
            data, state = self._zk.get(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError as e:
            return str(e), -2
        except ZookeeperError as e:
            return str(e), -3
        else:
            return data, state.version

    def get_children(self, path, watcher=None):
        """
        List child nodes.
        :param path: node path
        :param watcher: watcher callback
        :return: list of child nodes
        """
        try:
            data = self._zk.get_children(path)
            self._zk.DataWatch(path, watcher)
        except NoNodeError:
            return [], -2
        except ZookeeperError:
            return [], -3
        else:
            return data, 0

    def set(self, path, value, version=-1):
        """
        Set a node value.
        :param path: node path
        :param value: node value
        :param version: expected version (-1 to ignore)
        :return: new version on success, error message on failure.
        """
        try:
            state = self._zk.set(path, value, version)
        except BadVersionError as e:
            return str(e)
        except NoNodeError as e:
            return str(e)
        except ZookeeperError as e:
            return str(e)
        else:
            return state.version
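A hypothetical usage sketch for the register above; the path and payload are placeholders, and the client must not be read-only for create to succeed:

reg = ServiceRegister(hosts='127.0.0.1:2181', read_only=False)
path = reg.create('/services/demo', b'payload')    # returns the path on success
data, version = reg.get(path)                      # (b'payload', 0) on first read
assert reg.exist(path)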
Example #12
# -*- coding: utf-8 -*-
import time
import glob
import sys
import os
from kazoo.client import KazooClient
import logging
logging.basicConfig()

if __name__ == "__main__":
    print("begin.")
    zk = KazooClient(hosts='172.10.3.111:2181')
    zk.start()
    mypath = "/my/favorite"
    result = zk.retry(zk.get, mypath)
    print("result", result)
    zk.stop()
    print("exit.")
Example #13
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout', None) or 30),
                                  command_retry={
                                      'deadline': (config.get('reconnect_timeout', None) or 10),
                                      'max_delay': 1,
                                      'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.cluster_event = self.client.handler.event_object()

        self.fetch_cluster = True
        self.members = []
        self.leader = None
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, name, watch=None):
        try:
            return self.client.get(self.client_path(name), watch)
        except NoNodeError:
            pass
        except:
            logger.exception('get_node')
        return None

    @staticmethod
    def member(name, value, znode):
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.mzxid, name, conn_url, api_url, None, None)

    def load_members(self):
        members = []
        for member in self.client.get_children(self.client_path('/members'), self.cluster_watcher):
            data = self.get_node('/members/' + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.cluster_event.clear()
        leader = self.get_node('/leader', self.cluster_watcher)
        self.members = self.load_members()
        if leader:
            if leader[0] == self._name:
                client_id = self.client.client_id
                if client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                    logger.info('I am leader but not owner of the session. Removing leader node')
                    self.client.delete(self.client_path('/leader'))
                    leader = None

            if leader:
                for member in self.members:
                    if member.name == leader[0]:
                        leader = member
                        self.fetch_cluster = False
                        break
            if not isinstance(leader, Member):
                leader = Member(-1, leader, None, None, None, None)
        self.leader = leader
        if self.fetch_cluster:
            last_leader_operation = self.get_node('/optime/leader')
            if last_leader_operation:
                self.last_leader_operation = int(last_leader_operation[0])

    def get_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        return Cluster(True, self.leader, self.last_leader_operation, self.members)

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, self.client_path(path), value, **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create('/leader', self._name, makepath=True, ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def race(self, path):
        return self._create(path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        for m in self.members:
            if m.name == self._name:
                return True
        path = self.client_path('/members/' + self._name)
        try:
            self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
                return True
            except:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        last_operation = state_handler.last_operation()
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.client_path('/optime/leader')
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        if isinstance(self.leader, Member) and self.leader.name == self._name:
            self.client.delete(self.client_path('/leader'))

    def sleep(self, timeout):
        self.cluster_event.wait(timeout)
        if self.cluster_event.isSet():
            self.fetch_cluster = True
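update_leader above uses the set-then-create pattern that recurs throughout these examples: attempt a retried set first, and fall back to a retried create with makepath=True when the node does not exist yet. Distilled into a standalone sketch (path and value are hypothetical):

from kazoo.exceptions import NoNodeError

def set_or_create(client, path, value):
    # try an update first; create the node (and any parents) if it is missing
    try:
        client.retry(client.set, path, value)
    except NoNodeError:
        client.retry(client.create, path, value, makepath=True)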
Example #14
class SolrCloudManager:
    def __init__(self, zk_host):
        self.__zk = KazooClient(hosts=zk_host)
        self.__zk.start()

    def __del__(self):
        self.__zk.stop()

    def get_cluster_state(self):
        cs_tuple = self.__zk.retry(self.__zk.get, 'clusterstate.json')
        cs = json.loads(cs_tuple[0])
        return cs

    # Check all replicas that contain node_name
    # Return true if ALL nodes are in the active state
    def replicas_are_active(self, node_name):
        cluster_state = self.get_cluster_state()
        active = True
        for cn, cdata in cluster_state.items():
            for sn, sdata in cdata['shards'].items():
                replica_down = False
                node_in_replica = False
                for rn, rdata in sdata['replicas'].items():
                    if rdata['node_name'] == node_name:
                        node_in_replica = True
                    if rdata['state'] != "active":
                        replica_down = True
                if replica_down and node_in_replica:
                    active = False
            if not active:
                break
        return active

    # Wait for all replicas to enter the active state
    def wait_for_replicas(self, node_name, timeout):
        start_time = time.time()
        ra = self.replicas_are_active(node_name)
        while ((start_time + timeout) > time.time()) and (not ra):
            print "Waiting for replication to finish"
            time.sleep(3)
            ra = self.replicas_are_active(node_name)
        return ra

    def node_is_live(self, node_name):
        live_nodes = self.__zk.retry(self.__zk.get_children, 'live_nodes')
        return (node_name in live_nodes)

    def wait_for_live_node(self, node_name, timeout):
        start_time = time.time()
        lv = self.node_is_live(node_name)
        while ((start_time + timeout) > time.time()) and (not lv):
            print "Waiting for live node"
            time.sleep(3)
            lv = self.node_is_live(node_name)
        return lv

    def _remove_live_node(self, node_name):
        print(green('Deleting: live_nodes/%s' % (node_name)))
        self.__zk.retry(self.__zk.delete, 'live_nodes/' + node_name)
        return True

    def _restart_host_solr_service(self, host):
        print(green('Restarting: %s' % (host)))
        result = sudo("restart solr-undertow")
        if result.failed:
            print(red('Failed to restart: %s' % (host)))
            return False
        return True

    def restart_host_solr(self, host, host_port='8983', force=False, ln_timeout=240, rn_timeout=600):
        if host is None:
            return self._return_message(1, 'host is required')

        node_name = host + ':' + host_port + '_solr'
        if (not force) and (not self.node_is_live(node_name)):
            return self._return_message(10, 'Node is not live')

        # Don't restart if any other replicas are down
        if (not force) and (not self.replicas_are_active(node_name)):
            return self._return_message(20, 'Not all replicas are active')

        # LATER Make sure a reindex isn't in progress

        if not self._remove_live_node(node_name):
            return self._return_message(30, 'Error removing live node')

        if not self._restart_host_solr_service(host):
            return self._return_message(40, 'Error restarting solr service')

        if not self.wait_for_live_node(node_name, ln_timeout):
            return self._return_message(50, 'Timeout waiting for live node')

        if not self.wait_for_replicas(node_name, rn_timeout):
            return self._return_message(60, 'Timeout waiting for replicas')

    def _return_message(self, error_code, message):
        print(red({'status': error_code, 'message': message}))
        sys.exit(error_code)
Example #15
class ZookeeperServiceRegistry(BaseServiceRegistry):
    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        return cls(hosts=config.get('hosts', DEFAULT_HOSTS),
                   chroot=config.get('chroot', DEFAULT_CHROOT),
                   **kwargs)

    def on_start(self, timeout=10):
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        try:
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(path,
                                       watch=functools.partial(
                                           self.on_service_watch, service))
        value, znode = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        result = self.client.get_children_async(path='%s/services' %
                                                self.chroot, )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        def child_watch(event):
            print(event)

        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None,
                                "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)
        identities = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            try:
                identities.remove(identity)
            except KeyError:
                pass
        for identity in identities:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(path,
                                          value.encode('utf-8'),
                                          ephemeral=True,
                                          makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)
Example #16
class ZooKeeper(AbstractDCS):
    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        mapping = {
            'use_ssl': 'use_ssl',
            'verify': 'verify_certs',
            'cacert': 'ca',
            'cert': 'certfile',
            'key': 'keyfile',
            'key_password': 'keyfile_password'
        }
        kwargs = {v: config[k] for k, v in mapping.items() if k in config}

        self._client = KazooClient(
            hosts,
            handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
            timeout=config['ttl'],
            connection_retry=KazooRetry(max_delay=1,
                                        max_tries=-1,
                                        sleep_func=time.sleep),
            command_retry=KazooRetry(deadline=config['retry_timeout'],
                                     max_delay=1,
                                     max_tries=-1,
                                     sleep_func=time.sleep),
            **kwargs)
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True
        self._fetch_optime = True

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, *args):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(*args)
        return max(self.loop_wait - 2, 2) * 1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def optime_watcher(self, event):
        self._fetch_optime = True
        self.event.set()

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.optime_watcher(event)

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        retry = self._client.retry if isinstance(
            self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    def get_leader_optime(self, leader):
        watch = self.optime_watcher if not leader or leader.name != self._name else None
        optime = self.get_node(self.leader_optime_path, watch)
        self._fetch_optime = False
        return optime and int(optime[0]) or 0

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner,
                                value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        members = []
        for member in self.get_children(self.members_path,
                                        self.cluster_watcher):
            watch = member in sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(
            self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path)
                      or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(
            self.config_path,
            watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version,
                                                    config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(
            self.history_path,
            watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(
            history[1].mzxid, history[0])

        # get synchronization state
        sync = self.get_node(
            self.sync_path,
            watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.members or []
        members = self.load_members(
            sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(
            self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info(
                    'I am leader but not owner of the session. Removing leader node'
                )
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]]
                          or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner,
                                member)
                self._fetch_cluster = member.index == -1

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(
            leader)

        # failover key
        failover = self.get_node(
            self.failover_path,
            watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version,
                                                   failover[0])

        return Cluster(initialize, config, leader, last_leader_operation,
                       members, failover, sync, history)

    def _load_cluster(self):
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        # Optime ZNode was updated or doesn't exist and we are not leader
        elif (self._fetch_optime and not self._fetch_cluster or not cluster.last_leader_operation) and\
                not (cluster.leader and cluster.leader.name == self._name):
            try:
                optime = self.get_leader_optime(cluster.leader)
                cluster = Cluster(cluster.initialize, cluster.config,
                                  cluster.leader, optime, cluster.members,
                                  cluster.failover, cluster.sync,
                                  cluster.history)
            except Exception:
                pass
        return cluster

    def _bypass_caches(self):
        self._fetch_cluster = True

    def _create(self, path, value, retry=False, ephemeral=False):
        try:
            if retry:
                self._client.retry(self._client.create,
                                   path,
                                   value,
                                   makepath=True,
                                   ephemeral=ephemeral)
            else:
                self._client.create_async(path,
                                          value,
                                          makepath=True,
                                          ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path,
                           self._name.encode('utf-8'),
                           retry=True,
                           ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self,
                       key,
                       value,
                       index=None,
                       retry=False,
                       do_not_create_empty=False):
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set,
                                   key,
                                   value,
                                   version=index or -1)
            else:
                self._client.set_async(key, value, version=index
                                       or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name,
                                                fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        if member and (self._client.client_id is not None
                       and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}),
                                         data.get('tags', {})) and
                            member.data.get('version') == data.get('version')
                            and member.data.get('checkpoint_after_promote')
                            == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(
                    self.member_path,
                    encoded_data,
                    makepath=True,
                    ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path,
                                   encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        return True

    def _delete_leader(self):
        self._client.restart()
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete,
                                      self.client_path(''),
                                      recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        return self._set_or_create(self.sync_path,
                                   value,
                                   index,
                                   retry=True,
                                   do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        if super(ZooKeeper, self).watch(leader_index,
                                        timeout) and not self._fetch_optime:
            self._fetch_cluster = True
        return self._fetch_cluster
Example #17
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self._client = KazooClient(hosts=hosts, timeout=(config.get('session_timeout') or 30),
                                   command_retry={'deadline': (config.get('reconnect_timeout') or 10),
                                                  'max_delay': 1, 'max_tries': -1},
                                   connection_retry={'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        self._client.start()

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        if failover:
            failover = Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self._client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=True)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (not index and self._create(self.failover_path, value))
        except:
            logger.exception('set_failover_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None):
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        path = self.member_path
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.retry(self._client.delete, path)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member and data == self._my_member_data:
            return True

        try:
            if member:
                self._client.retry(self._client.set, path, data)
            else:
                self._client.retry(self._client.create, path, data, makepath=True, ephemeral=True)
            self._my_member_data = data
            return True
        except NodeExistsError:
            try:
                self._client.retry(self._client.set, path, data)
                self._my_member_data = data
                return True
            except:
                logger.exception('touch_member')
        except:
            logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self._last_leader_operation:
            self._last_leader_operation = last_operation
            path = self.leader_optime_path
            try:
                self._client.retry(self._client.set, path, last_operation)
            except NoNodeError:
                try:
                    self._client.retry(self._client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
Example #18
    #update
    try:
        result = None
        result = zk.set("/xy/test", b"some data")
    except Exception as e:
        print('exception when zk.set, %s' % e)
    else:
        print("zk.set /xy/test result %s" % str(result))

    # del
    result = zk.delete("/xy/test/node", recursive=True)
    print("zk.delete /xy/test/node result %s" % (result))

    # action
    try:
        result = zk.retry(zk.get, "/xy/test/nodex")
    except Exception as e:
        print('exception when zk.retry, %s' % e)
    else:
        print("zk.retry /xy/test/nodex result %s" % str(result))

    from kazoo.retry import KazooRetry
    kr = KazooRetry(max_tries=3, ignore_expire=False)
    try:
        result = kr(zk.get, "/xy/test/nodex")
    except Exception as e:
        print('exception when KazooRetry, %s' % e)
    else:
        print("KazooRetry zk.get /xy/test/nodex result %s" % (result))

    #watcher
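One detail worth calling out in the standalone KazooRetry above: as of the kazoo versions this code targets, ignore_expire=False makes the retry re-raise SessionExpiredError instead of retrying through it, whereas the default ignore_expire=True treats session expirations as retryable. A sketch of the two configurations:

from kazoo.retry import KazooRetry

kr_strict = KazooRetry(max_tries=3, ignore_expire=False)   # session expiry propagates
kr_lenient = KazooRetry(max_tries=3)                       # ignore_expire defaults to True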
Example #19
class ZooKeeper(AbstractDCS):

    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout', None) or 30),
                                  command_retry={
                                      'deadline': (config.get('reconnect_timeout', None) or 10),
                                      'max_delay': 1,
                                      'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.cluster_event = self.client.handler.event_object()

        self.cluster = None
        self.fetch_cluster = True
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, key, watch=None):
        try:
            ret = self.client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.version, name, conn_url, api_url, None, None)

    def get_children(self, key, watch=None):
        try:
            return self.client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.cluster_event.clear()
        nodes = set(self.get_children(self.client_path('')))

        # get initialize flag
        initialize = self._INITIALIZE in nodes

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path, self.cluster_watcher) if self._LEADER in nodes else None
        if leader:
            client_id = self.client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self.client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, None, None, None)
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, None, None, member)
                self.fetch_cluster = member.index == -1

        # get last leader operation
        self.last_leader_operation = self.get_node(self.leader_optime_path) if self.fetch_cluster else None
        self.last_leader_operation = 0 if self.last_leader_operation is None else int(self.last_leader_operation[0])
        self.cluster = Cluster(initialize, leader, self.last_leader_operation, members)

    def get_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        return self.cluster

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def initialize(self):
        return self._create(self.initialize_path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        if self.cluster and any(m.name == self._name for m in self.cluster.members):
            return True
        path = self.member_path
        connection_string = connection_string.encode('utf-8')
        try:
            self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
                return True
            except:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        last_operation = state_handler.last_operation().encode('utf-8')
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.leader_optime_path
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create, path, last_operation, makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        if isinstance(self.cluster, Cluster) and self.cluster.leader.name == self._name:
            self.client.delete(self.leader_path, version=self.cluster.leader.index)

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node and node[0] == self._name:
            self.client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self.client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def watch(self, timeout):
        self.cluster_event.wait(timeout)
        if self.cluster_event.isSet():
            self.fetch_cluster = True
Exemple #20
0
class Elector(threading.Thread):
    LOCKING, NOLOCK = 1, 2
    def __init__(self, zha):
        threading.Thread.__init__(self)
        self.zha = zha
        self.should_run = True
        self.in_entry_act = False
        self.in_entry_sby = False

        self.state = Elector.NOLOCK
        self.zk = KazooClient(hosts=self.zha.config.get("connection_string","127.0.0.1:2181"), logger=logger)
        self.zk.add_listener(self.zk_listener)
        self.zk.start()
        self.id = self.zha.config.get("id")
        self.lock = self.zk.Lock(self.zha.config.get("lock_znode","/zha-lock"), self.id)
        self.abcpath = self.zha.config.get("abc_znode","/zha-abc")

    # callbacks
    def on_become_active(self):
        if self.zha.config.become_active() == 0:
            logger.info("successfully become active")
            self.zha.set_state("ACT:HEALTHY")
            return True
        else:
            logger.info("activation failed..")
            return False

    def on_become_active_to_standby(self):
        self.zha.set_state("SBY:UNKNOWN") # state changed to SBY anyway.
        if self.zha.config.become_standby_from_active() == 0:
            logger.info("successfully become standby")
            return True
        else:
            logger.info("could not retire cleanly...")
            return False

    def on_fence(self):
        if self.zha.config.trigger_fence() == 0:
            logger.info("shot the node")
            return True
        else:
            logger.info("fencing failed...")
            return False

    def run(self):
        while self.should_run:
            self.in_elector_loop()
            time.sleep(self.zha.config.get("elector_interval",3))
        self.retire()
        self.zk.stop()
        logger.info("elector thread stopped.")

    def in_elector_loop(self):
        if self.zk.state != KazooState.CONNECTED:
            # zk listener will callback on LOST, so no need to call self.retire(),
            # but it takes a bit long to be LOST. Mostly other zha will fence me.
            return
        # for locker
        if self.state == Elector.LOCKING:
            if self.in_entry_act is False:
                self.retire()
                return
            return
        # for waiters
        try:
            lock_result = self.lock.acquire(timeout=self.zha.config.get("elector_interval",3))
        except LockTimeout:
            self.retire()
            logger.info("lock timeout")
            return
        if self.in_entry_act is False:
            self.retire()
            return
        if self.handle_abc() is False:
            self.retire()
            return
        if self.on_become_active() is False:
            self.zk_delete_my_abc()
            self.retire()
            return
        # if reached here, all done with lock
        self.state = Elector.LOCKING

    def zk_listener(self,zkstate):
        logger.info("zookeeper connection state changed %s"%(zkstate,) )
        if zkstate == KazooState.LOST:
            logger.info("(connection to zookeeper is lost/closed)")
            if self.state != Elector.LOCKING:
                return
            logger.info("become standby due to zk connection problem.")
            self.on_become_active_to_standby()
            self.state = Elector.NOLOCK
        elif zkstate == KazooState.SUSPENDED:
            return
        else:
            return

    def retire(self):
        if self.state == Elector.LOCKING:
            if self.on_become_active_to_standby():
                self.zk_delete_my_abc()  # don't care whether it succeeds; we may become standby leaving the abc znode behind.
        self.state = Elector.NOLOCK
        self.lock.release()

    def handle_abc(self):
        if not self.zk.retry(self.zk.exists,self.abcpath):
            self.zk.retry(self.zk.create, self.abcpath, self.id)
            return True
        data, stat = self.zk.retry(self.zk.get, self.abcpath)
        if data.strip()==self.id:
            return True
        else:
            if self.on_fence() is False:
                return False
            self.zk.retry(self.zk.set, self.abcpath, self.id)
        return True

    def zk_delete_my_abc(self):
        try:
            data, stat = self.zk.get(self.abcpath)
            assert data.strip() == self.id
            self.zk.delete(self.abcpath)
            return True
        except:
            return False
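The heart of this elector is the `handle_abc` handshake: the lock winner records its id in the abc znode, and a newcomer that finds a stale id must fence the previous holder before overwriting it. A standalone sketch of that handshake (hedged: `fence_previous_holder` is a hypothetical stand-in for `on_fence`, and the host/id values are illustrative):

from kazoo.client import KazooClient

def fence_previous_holder():
    # hypothetical stand-in for Elector.on_fence()
    pass

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()
my_id = b"node-1"
if not zk.retry(zk.exists, "/zha-abc"):
    # nobody has claimed the abc znode yet; record our id
    zk.retry(zk.create, "/zha-abc", my_id)
else:
    holder, _ = zk.retry(zk.get, "/zha-abc")
    if holder.strip() != my_id:
        # a stale holder: fence it before taking over
        fence_previous_holder()
        zk.retry(zk.set, "/zha-abc", my_id)
zk.stop()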
Exemple #21
0
class DeploymentConfig(object):
  """ Accesses deployment configuration options. """
  # The ZooKeeper node where configuration is stored.
  CONFIG_ROOT = '/appscale/config'

  def __init__(self, hosts):
    """ Creates new DeploymentConfig object.

    Args:
      hosts: A list of ZooKeeper hosts.
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    self.update_lock = Lock()
    self.state = ConfigStates.LOADING
    self.config = {}
    self.conn = KazooClient(hosts=hosts, read_only=True)
    self.conn.add_listener(self._conn_listener)
    self.conn.start()
    self.conn.ensure_path(self.CONFIG_ROOT)
    self.conn.ChildrenWatch(self.CONFIG_ROOT, func=self._update_config)

  def _conn_listener(self, state):
    """ Handles changes in ZooKeeper connection state.

    Args:
      state: A string indicating the new state.
    """
    if state == KazooState.LOST:
      self.logger.warning('ZK connection lost')
    elif state == KazooState.SUSPENDED:
      self.logger.warning('ZK connection suspended')
    else:
      self.logger.info('ZK connection established')

  def _load_child(self, child):
    """ Fetches the data for a configuration node.

    Args:
      child: A string containing the ZooKeeper node to fetch.
    Returns:
      A dictionary containing configuration data.
    Raises:
      ConfigInaccessible if ZooKeeper is not accessible.
    """
    node = '/'.join([self.CONFIG_ROOT, child])
    # NoNodeError subclasses ZookeeperError, so it must be caught first
    try:
      data, _ = self.conn.retry(self.conn.get, node)
    except NoNodeError:
      return {}
    except (KazooException, ZookeeperError):
      raise ConfigInaccessible('ZooKeeper connection not available')

    try:
      return json.loads(data)
    except ValueError:
      self.logger.warning('Invalid deployment config: {}'.format(child))
      return {}

  def _update_config(self, children):
    """ Updates configuration when it changes.

    Args:
      children: A list of ZooKeeper nodes.
    """
    with self.update_lock:
      self.state = ConfigStates.LOADING

      # Ensure old sections are removed.
      self.config = {}

      for child in children:
        while True:
          try:
            self.config[child] = self._load_child(child)
            break
          except ConfigInaccessible as load_error:
            self.logger.warning(str(load_error))
            time.sleep(SMALL_WAIT)

      self.logger.info('Deployment configuration updated')
      self.state = ConfigStates.LOADED

  def get_config(self, section):
    """ Fetches the configuration for a given section.

    Args:
      section: A string specifying the section to fetch.
    Returns:
      A dictionary containing configuration data.
    Raises:
      ConfigInaccessible if ZooKeeper is inaccessible.
    """
    # If the connection is established, it should finish loading very soon.
    while (self.state == ConfigStates.LOADING and
           self.conn.state not in (KazooState.LOST, KazooState.SUSPENDED)):
      time.sleep(TINY_WAIT)

    if self.state != ConfigStates.LOADED:
      raise ConfigInaccessible('ZooKeeper connection not available')

    with self.update_lock:
      if section not in self.config:
        return {}
      return self.config[section]

  def close(self):
    """ Close the ZooKeeper connection. """
    self.conn.stop()
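`get_config` blocks until the initial load completes, so typical use is short. A sketch; the hosts and the 'queues' section name are illustrative:

config = DeploymentConfig(hosts=['10.0.0.1:2181', '10.0.0.2:2181'])
try:
    queue_settings = config.get_config('queues')  # {} when the section is absent
except ConfigInaccessible:
    queue_settings = {}
config.close()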
Exemple #22
0
class ZooKeeper(AbstractDCS):
    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'],
                                                       exhibitor['port'],
                                                       poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout', None) or 30),
                                  command_retry={
                                      'deadline': (config.get('reconnect_timeout', None) or 10),
                                      'max_delay': 1,
                                      'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.cluster_event = self.client.handler.event_object()

        self.fetch_cluster = True
        self.members = []
        self.leader = None
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, name, watch=None):
        try:
            return self.client.get(self.client_path(name), watch)
        except NoNodeError:
            pass
        except:
            logger.exception('get_node')
        return None

    @staticmethod
    def member(name, value, znode):
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.mzxid, name, conn_url, api_url, None, None)

    def load_members(self):
        members = []
        for member in self.client.get_children(self.client_path('/members'),
                                               self.cluster_watcher):
            data = self.get_node('/members/' + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.cluster_event.clear()
        leader = self.get_node('/leader', self.cluster_watcher)
        self.members = self.load_members()
        if leader:
            if leader[0] == self._name:
                client_id = self.client.client_id
                if client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                    logger.info('I am leader but not owner of the session. Removing leader node')
                    self.client.delete(self.client_path('/leader'))
                    leader = None

            if leader:
                for member in self.members:
                    if member.name == leader[0]:
                        leader = member
                        self.fetch_cluster = False
                        break
            if not isinstance(leader, Member):
                leader = Member(-1, leader[0], None, None, None, None)
        self.leader = leader
        if self.fetch_cluster:
            last_leader_operation = self.get_node('/optime/leader')
            if last_leader_operation:
                self.last_leader_operation = int(last_leader_operation[0])

    def get_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        return Cluster(True, self.leader, self.last_leader_operation,
                       self.members)

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, self.client_path(path),
                              value, **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create('/leader',
                           self._name,
                           makepath=True,
                           ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def race(self, path):
        return self._create(path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        for m in self.members:
            if m.name == self._name:
                return True
        path = self.client_path('/members/' + self._name)
        try:
            self.client.retry(self.client.create,
                              path,
                              connection_string,
                              makepath=True,
                              ephemeral=True)
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create,
                                  path,
                                  connection_string,
                                  makepath=True,
                                  ephemeral=True)
                return True
            except:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        last_operation = state_handler.last_operation()
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.client_path('/optime/leader')
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create,
                                      path,
                                      last_operation,
                                      makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        if isinstance(self.leader, Member) and self.leader.name == self._name:
            self.client.delete(self.client_path('/leader'))

    def sleep(self, timeout):
        self.cluster_event.wait(timeout)
        if self.cluster_event.isSet():
            self.fetch_cluster = True
Exemple #23
0
class ZkCoordinator(StaticCoordinator):
    @staticmethod
    def fromGroup(zkConnect: str, group: str) -> 'ZkCoordinator':
        """Convenience method for instantiation using conventional paths based on group.

        The path convention is:

            /static_assignment/[group]/assignment
            /static_assignment/[group]/members

        Args:
            zkConnect (str): Comma-separated list of hosts to connect to (e.g. 127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
            group (str): The name of the consumer group this coordinator belongs to. Must not be None.
        """

        if group is None or len(group.strip()) == 0:
            raise ValueError('ZkCoordinator: Invalid `group` argument, it must not be None or blank.')

        prePath = f'/static_assignment/{group.strip()}'
        assignmentPath = f'{prePath}/assignment'
        membersPath = f'{prePath}/members'
        return ZkCoordinator(zkConnect, membersPath, assignmentPath)

    def __init__(self, zkConnect: str, membersPath: str, assignmentsPath: str):
        """Zookeeper implementation of `StaticCoordinator`

        Args:
            zkConnect (str): Comma-separated list of hosts to connect to (e.g. 127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
            membersPath (str): Zookeeper path at which members will create ephemeral nodes asserting their ID.
            assignmentsPath (str): Zookeeper path at which the current assignments are kept.
        """

        for val, name in ((zkConnect, 'zkConnect'), (membersPath, 'membersPath'), (assignmentsPath, 'assignmentsPath')):
            if val is None or len(val.strip()) == 0:
                raise ValueError(f'ZkCoordinator: Invalid `{name}` argument, it must not be None or blank')

        logger.info('ZkCoordinator starting with membersPath=%s, assignmentsPath=%s', membersPath, assignmentsPath)
        self._zkConnect = zkConnect

        self._membersPath = membersPath
        self._membersPathEnsured = False

        self._assignmentsPath = assignmentsPath
        self._assignmentsPathEnsured = False
        self._currentAssignment = None
        self._assignmentsWatcher = None
        self._memberMetaData: Optional[StaticMemberMeta] = None

        self.zk = KazooClient(hosts=zkConnect)
        self.zk.add_listener(self._zkListener())
        self._memberId: Optional[MemberId] = None

    def _zkListener(self):
        def listener(state):
            if state == KazooState.LOST:
                self._memberId = None
                self._currentAssignment = None

        return listener

    def _establishSession(self):

        if self._assignmentsWatcher is None:
            # add watch for assignment updates
            def watchAssignments(data, stat, event):
                self._currentAssignment = self._processAssignmentsData(data)
                logger.info('Assignment update received. | assignments= %s', self._currentAssignment)

            self._ensureAssignmentsPath()
            self._assignmentsWatcher = DataWatch(self.zk, self._assignmentsPath, watchAssignments)

    def _ensureAssignmentsPath(self):
        if not self._assignmentsPathEnsured:
            self.zk.ensure_path(self._assignmentsPath)
            self._assignmentsPathEnsured = True

    def _fetchAssignments(self) -> Optional[Assignments]:
        return self._currentAssignment

    def _processAssignmentsData(self, rawData):
        if rawData is not None:
            return Assignments.fromJson(rawData.decode('utf-8'))

    def _ensureMembersPath(self):
        if not self._membersPathEnsured:
            self.zk.ensure_path(self._membersPath)
            self._membersPathEnsured = True

    def _createPath(self, altMemberId: Optional[MemberId] = None):
        mid = self._memberId

        if altMemberId is not None:
            mid = altMemberId

        if mid is not None:
            return f'{self._membersPath}/{mid}'

        return None

    def _encodeMemberData(self, meta: StaticMemberMeta):
        return ujson.dumps(meta.asDict()).encode('utf-8')

    def _compareAndUpdateMemberData(self, meta: StaticMemberMeta):
        newDict = None
        selfDict = None
        if self._memberMetaData is not None and meta is not None:
            selfDict = self._memberMetaData.asDict()
            newDict = meta.asDict()

            isDiff = (
                selfDict['hostId'] != newDict['hostId']
                or selfDict['assignment']['configVersion'] != newDict['assignment']['configVersion']
                or selfDict['assignment']['version'] != newDict['assignment']['version']
            )
        else:
            isDiff = True

        if isDiff:
            self._memberMetaData = meta

            path = self._createPath()
            if path is not None:

                def cb(async_obj):
                    try:
                        async_obj.get()
                        logger.info('Member meta data updated. | metaData=%s', meta)
                    except (ConnectionLoss, SessionExpiredError):
                        logger.exception('Failed to update member meta data.')

                self.zk.set_async(path, self._encodeMemberData(meta)).rawlink(cb)

    def updateAssignments(self, meta: StaticMemberMeta, newAssignments: Assignments):
        self.zk.retry(self._innerUpdateAssignment, newAssignments)

    def _innerUpdateAssignment(self, assignment: Assignments):
        self._ensureAssignmentsPath()
        self.zk.set(self._assignmentsPath, assignment.asJson().encode('utf-8'))
        logger.info('Assignments updated. | assignments=%s', assignment)

    def leave(self, meta: StaticMemberMeta):
        self.zk.retry(self._innerLeave)

    def _innerLeave(self):
        path = self._createPath()
        if path is not None:
            try:
                self.zk.delete(path)
            except (ConnectionLoss, SessionExpiredError):
                logger.exception(
                    'Failed to relinquish member ID, '
                    "will assume ephemeral node will expire on it's own. "
                    '| memberId=%s',
                    self._memberId,
                )
            self._memberId = None

    def join(self, meta: StaticMemberMeta):

        asgns = self._fetchAssignments()
        if asgns is None:
            logger.warning('Cannot join a group without assignments. | assignmentsPath=%s', self._assignmentsPath)
            return None

        if self._memberId is None:
            self._memberId = self._inner_join(meta, asgns.maxMembers)

        return self._memberId

    def _inner_join(self, meta: StaticMemberMeta, maxMembers: int) -> Optional[MemberId]:
        idList = range(maxMembers)
        memberData = self._encodeMemberData(meta)
        self._ensureMembersPath()

        foundMid = None

        for mid in idList:
            memberIdPath = self._createPath(mid)

            try:
                self.zk.create(memberIdPath, memberData, ephemeral=True)
                foundMid = mid
                logger.debug('Member id acquired. | memberId=%s', mid)
                break
            except NodeExistsError:
                # move onto the next node
                logger.debug('Member id already taken moving to next. | memberId=%s', mid)
            except (ConnectionLoss, SessionExpiredError):
                logger.exception('Member id acquisition attempt failed with error.')
                time.sleep(1)

        self._memberMetaData = meta
        return foundMid

    def assignments(self, meta: StaticMemberMeta) -> Optional[Assignments]:
        self._compareAndUpdateMemberData(meta)
        return self._fetchAssignments()

    def heartbeat(self, meta: StaticMemberMeta) -> Optional[MemberId]:
        self._compareAndUpdateMemberData(meta)
        return self._memberId

    def stop(self):
        self.zk.stop()
        self.zk.close()

    def start(self):
        self.zk.start()
        self._establishSession()
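A usage sketch under the path convention from `fromGroup` (hedged: the connect string and group name are illustrative, and `meta` stands in for a caller-supplied StaticMemberMeta):

coordinator = ZkCoordinator.fromGroup('127.0.0.1:2181', 'my-consumer-group')
coordinator.start()                 # connects and registers the assignments DataWatch
member_id = coordinator.join(meta)  # meta: a StaticMemberMeta built by the caller
if member_id is not None:
    coordinator.heartbeat(meta)
coordinator.stop()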
Exemple #24
0
class ZookeeperServiceRegistry(BaseServiceRegistry):
    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        return cls(
            hosts=config.get('hosts', DEFAULT_HOSTS),
            chroot=config.get('chroot', DEFAULT_CHROOT),
            **kwargs
        )

    def on_start(self, timeout=10):
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)', '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        try:
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(
            path, watch=functools.partial(self.on_service_watch, service))
        value, znode = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        result = self.client.get_children_async(
            path='%s/services' % self.chroot,
        )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        def child_watch(event):
            print(event)
        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None, "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)
        identities = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            try:
                identities.remove(identity)
            except KeyError:
                pass
        for identity in identities:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(
            path,
            value.encode('utf-8'),
            ephemeral=True, makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)
Exemple #25
0
class _ZookeeperProxy(object):
    def __init__(self, address_provider: AddressListProvider, prefix: str):
        self.address_provider = address_provider
        self.async_counter = WaitingCounter(limit=100)
        self.conn_str = None
        self.client = None
        self.prefix = prefix
        self.hosts_cache = SlowlyUpdatedCache(
            self.address_provider.get_latest_address,
            self._update_hosts,
            30,  # Refresh every 30 seconds
            3 * 60)  # Update only after 180 seconds of stability

    def _update_hosts(self, value):
        hosts, port = value
        if hosts:
            self.conn_str = ','.join(['{}:{}'.format(h, port)
                                      for h in hosts]) + self.prefix
            if self.client is None:
                self.client = KazooClient(hosts=self.conn_str,
                                          command_retry={
                                              'deadline': 120,
                                              'max_delay': 1,
                                              'max_tries': -1
                                          },
                                          connection_retry={
                                              'max_delay': 1,
                                              'max_tries': -1
                                          })
                self.client.add_listener(self.session_listener)
            else:
                self.client.stop()
                self.client.set_hosts(self.conn_str)
            self.client.start()

    def terminate(self):
        if self.client:
            self.client.stop()

    def session_listener(self, state):
        pass

    def get_conn_str(self):
        return self.conn_str

    def get(self, *params):
        self.hosts_cache.touch()
        return self.client.retry(self.client.get, *params)

    def get_async(self, *params):
        # Exhibitor is not polled here and it's totally fine!
        self.async_counter.increment()
        try:
            i_async = self.client.get_async(*params)
            i_async.rawlink(self._decrement)
            return i_async
        except Exception as e:
            self._decrement()
            raise e

    def _decrement(self, *args, **kwargs):
        self.async_counter.decrement()

    def set(self, *args, **kwargs):
        self.hosts_cache.touch()
        return self.client.retry(self.client.set, *args, **kwargs)

    def create(self, *args, **kwargs):
        self.hosts_cache.touch()
        return self.client.retry(self.client.create, *args, **kwargs)

    def delete(self, *args, **kwargs):
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.delete, *args, **kwargs)
        except NoNodeError:
            pass

    def get_children(self, *params):
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []

    def take_lock(self, *args, **kwargs):
        while True:
            try:
                self.hosts_cache.touch()
                return self.client.Lock(*args, **kwargs)
            except Exception as e:
                _LOG.error('Failed to obtain lock for exhibitor, retrying',
                           exc_info=e)
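A usage sketch; `MyAddressProvider` is a hypothetical AddressListProvider whose `get_latest_address` returns a `(hosts, port)` tuple, and the znode paths are illustrative:

class MyAddressProvider:
    def get_latest_address(self):
        return (['10.0.0.1'], 2181)  # hypothetical static address list

proxy = _ZookeeperProxy(MyAddressProvider(), prefix='/my-chroot')
data, stat = proxy.get('/some/znode')   # touches the hosts cache, then retries the get
children = proxy.get_children('/some')  # [] when the node does not exist
proxy.terminate()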
Exemple #26
0
class DeploymentConfig(object):
    """ Accesses deployment configuration options. """
    # The ZooKeeper node where configuration is stored.
    CONFIG_ROOT = '/appscale/config'

    def __init__(self, hosts):
        """ Creates new DeploymentConfig object.

        Args:
            hosts: A list of ZooKeeper hosts.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.update_lock = Lock()
        self.state = ConfigStates.LOADING
        self.config = {}
        self.conn = KazooClient(hosts=hosts, read_only=True)
        self.conn.add_listener(self._conn_listener)
        self.conn.start()
        self.conn.ensure_path(self.CONFIG_ROOT)
        self.conn.ChildrenWatch(self.CONFIG_ROOT, func=self._update_config)

    def _conn_listener(self, state):
        """ Handles changes in ZooKeeper connection state.

        Args:
            state: A string indicating the new state.
        """
        if state == KazooState.LOST:
            self.logger.warning('ZK connection lost')
        elif state == KazooState.SUSPENDED:
            self.logger.warning('ZK connection suspended')
        else:
            self.logger.info('ZK connection established')

    def _load_child(self, child):
        """ Fetches the data for a configuration node.

        Args:
            child: A string containing the ZooKeeper node to fetch.
        Returns:
            A dictionary containing configuration data.
        Raises:
            ConfigInaccessible if ZooKeeper is not accessible.
        """
        node = '/'.join([self.CONFIG_ROOT, child])
        # NoNodeError subclasses ZookeeperError, so it must be caught first
        try:
            data, _ = self.conn.retry(self.conn.get, node)
        except NoNodeError:
            return {}
        except (KazooException, ZookeeperError):
            raise ConfigInaccessible('ZooKeeper connection not available')

        try:
            return json.loads(data)
        except ValueError:
            self.logger.warning('Invalid deployment config: {}'.format(child))
            return {}

    def _update_config(self, children):
        """ Updates configuration when it changes.

        Args:
            children: A list of ZooKeeper nodes.
        """
        with self.update_lock:
            self.state = ConfigStates.LOADING

            # Ensure old sections are removed.
            self.config = {}

            for child in children:
                while True:
                    try:
                        self.config[child] = self._load_child(child)
                        break
                    except ConfigInaccessible as load_error:
                        self.logger.warning(str(load_error))
                        time.sleep(SMALL_WAIT)

            self.logger.info('Deployment configuration updated')
            self.state = ConfigStates.LOADED

    def get_config(self, section):
        """ Fetches the configuration for a given section.

        Args:
            section: A string specifying the section to fetch.
        Returns:
            A dictionary containing configuration data.
        Raises:
            ConfigInaccessible if ZooKeeper is inaccessible.
        """
        # If the connection is established, it should finish loading very soon.
        while (self.state == ConfigStates.LOADING and self.conn.state
               not in (KazooState.LOST, KazooState.SUSPENDED)):
            time.sleep(TINY_WAIT)

        if self.state != ConfigStates.LOADED:
            raise ConfigInaccessible('ZooKeeper connection not available')

        with self.update_lock:
            if section not in self.config:
                return {}
            return self.config[section]

    def close(self):
        """ Close the ZooKeeper connection. """
        self.conn.stop()
Exemple #27
0
class ZkDefinitionWatcher:
    def __init__(self, zk_hosts, def_path='/env/?/sd/!'):
        c_retry = KazooRetry(-1, max_delay=60)
        self._zk = KazooClient(','.join(zk_hosts),
                               read_only=True,
                               connection_retry=c_retry)
        self._zk.start(timeout=10)

        self._child_watchers = {}
        self._data_watchers = {}

        # Note about "stack": the stack is a ZK path of where to find service definitions. It solves the problem of a
        # tree watcher watching an entire tree. A node is only watched if it adheres to the stack pattern. A "?" means
        # "watch children of whatever is here" (this must be the environment). A "!" means "watch data of whatever is
        # here" (this must be the service defs). Anything else is a constant, so it will only watch the children of that
        # specific node if it exists.
        self._stack = tuple(def_path.strip('/').split('/'))

        we = WatchedEvent(None, None, '/' + self._stack[0])
        self._watch_children(we)

    def poll(self):
        env_index = self._stack.index('?')
        name_index = self._stack.index('!')

        ret = {}
        for path, data in self._data_watchers.items():
            # Get env from path
            env = path.strip('/').split('/')[env_index]
            # Get name from path
            name = path.strip('/').split('/')[name_index]

            data['path'] = path

            ret[(env, name)] = data

        return ret

    def _watch_children(self, event):
        # If this node was deleted, remove its child watcher from our records
        if event.type == EventType.DELETED:
            del self._child_watchers[event.path]
            # remove datawatchers?
            return

        # Get children and set a child watch for the next event of this path
        children = self._zk.retry(self._zk.get_children,
                                  event.path,
                                  watch=self._watch_children)
        # Update our records
        self._child_watchers[event.path] = children

        # If no children, there is nothing to do; no watchers to set
        if len(children) == 0:
            return

        # Find location in stack for children
        level = len(event.path.strip('/').split('/'))
        child_depth_marker = self._stack[level]

        for child in children:
            path = "{0}/{1}".format(event.path, child)

            if child_depth_marker == '?' or child == child_depth_marker:
                # Set child_watcher for each child
                if path not in self._child_watchers:
                    we = WatchedEvent(None, None, path)
                    self._watch_children(we)

            elif child_depth_marker == '!':
                # Set data_watcher for each child
                if path not in self._data_watchers:
                    we = WatchedEvent(None, None, path)
                    self._watch_data(we)

    def _watch_data(self, event):
        # If this node was deleted, remove its data watcher from our records
        if event.type == EventType.DELETED:
            del self._data_watchers[event.path]
            return

        # Get data and set a data watch for the next event of this path
        data, _stat = self._zk.retry(self._zk.get,
                                     event.path,
                                     watch=self._watch_data)

        # Update our records
        try:
            # TODO: validate JSON?
            parsed_data = json.loads(data)
            self._data_watchers[event.path] = parsed_data
        except ValueError:
            logger.warning('Service definition "' + event.path +
                           '" cannot be parsed as JSON')
            # If service def isn't in proper format, remove it from known service defs
            if event.path in self._data_watchers:
                del self._data_watchers[event.path]
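A usage sketch (illustrative hosts): with the default stack '/env/?/sd/!' the watcher tracks /env/<environment>/sd/<service> definition znodes, and `poll` returns whatever JSON the data watchers last saw:

watcher = ZkDefinitionWatcher(['10.0.0.1:2181'])
definitions = watcher.poll()  # {(env, service_name): parsed definition dict}
for (env, name), definition in definitions.items():
    print(env, name, definition['path'])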
Exemple #28
0
    if zk_client.exists('/node1'):
        print('node node1 exists, path: /node1')

    data, stat = zk_client.get('/node1')
    if stat:
        print("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))

    children = zk_client.get_children('/node1')
    print('node1 has %s children, named: %s' % (len(children), children))
    print('children of /:', zk_client.get_children('/'))

    zk_client.set('/node1/subNode2', b'some new data')
    zk_client.delete('/node1', recursive=True)

    try:
        result = zk_client.retry(zk_client.get, '/node1/subNode3')
        print(result)
        kr = KazooRetry(max_tries=3, ignore_expire=False)
        result = kr(zk_client.get, '/node1/subNode3')
    except Exception as e:
        print('/node1/subNode3 does not exist, so this raises an error')
        zk_client.stop()

    while zk_conn_status != 3:
        continue
    else:
        i = 0
        while i < 300:
            if i % 20 == 0:
                time.sleep(2)
                print('creating a new node')
Exemple #29
0
zk.start()

base_zk_path = '%s/%s' % (service_ns, service_id)

def resolve_path(path):
  rel_path = relpath(path, config_dir)
  return base_zk_path if rel_path == '.' else join(base_zk_path, rel_path)

if exists(config_dir) and isdir(config_dir):
  print('Acquiring access lock...', file=stderr)
  with zk.Lock(base_zk_path + '.lock', node_id):
    for dirname, dirs, files in os.walk(config_dir):
      zk.ensure_path(resolve_path(dirname))
      print('  Directory zk://' + resolve_path(dirname), file=stderr)
      for filename in files:
        filename = join(dirname, filename)
        config_path = resolve_path(filename)
        value = open(filename, 'rb').read()
        if zk.exists(config_path):
          print('   Updating zk://%s from %s [%d bytes]' % (config_path, filename, len(value)), file=stderr)
          zk.retry(zk.set, config_path, value)
        else:
          print('   Creating zk://%s from %s [%d bytes]' % (config_path, filename, len(value)), file=stderr)
          zk.retry(zk.create, config_path, value)
else:
  print('Invalid configuration directory', file=stderr)

success = True

zk.stop()
Exemple #30
0
class ZooKeeper(AbstractDCS):

    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry={'max_delay': 1, 'max_tries': -1},
                                   command_retry={'deadline': config['retry_timeout'], 'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):

        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
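        # Illustration (assumed typical value): with loop_wait=10 this returns a
        # read_timeout of (10 - 2) * 1000 = 8000 ms, so kazoo pings roughly every
        # 4 seconds regardless of the negotiated session timeout.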
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    def set_retry_timeout(self, retry_timeout):
        self._client._retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, config, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (index is None and self._create(self.failover_path, value))
        except:
            logging.exception('set_failover_value')
            return False

    def set_config_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.config_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return index is None and self._create(self.config_path, value)
        except Exception:
            logging.exception('set_config_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member:
            if data == self._my_member_data:
                return True
        else:
            try:
                self._client.create_async(self.member_path, data, makepath=True, ephemeral=not permanent).get(timeout=1)
                self._my_member_data = data
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, data).get(timeout=1)
            self._my_member_data = data
            return True
        except:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self._last_leader_operation:
            try:
                self._client.set_async(self.leader_optime_path, last_operation).get(timeout=1)
                self._last_leader_operation = last_operation
            except NoNodeError:
                try:
                    self._client.create_async(self.leader_optime_path, last_operation, makepath=True).get(timeout=1)
                    self._last_leader_operation = last_operation
                except:
                    logger.exception('Failed to create %s', self.leader_optime_path)
            except:
                logger.exception('Failed to update %s', self.leader_optime_path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
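
# A minimal standalone sketch (illustrative, not part of the class above) of
# the compare-and-set pattern that set_failover_value/set_config_value use:
# write with an expected znode version (-1 matches any version) and fall back
# to create when the node does not exist yet. Names and paths are assumptions.
from kazoo.exceptions import NoNodeError


def set_or_create(zk, path, value, index=None):
    try:
        # a concrete `index` turns this into a compare-and-set write
        zk.retry(zk.set, path, value.encode('utf-8'),
                 version=index if index is not None else -1)
        return True
    except NoNodeError:
        # only create when the caller did not demand a specific version
        if index is not None:
            return False
        zk.retry(zk.create, path, value.encode('utf-8'), makepath=True)
        return True

# usage (connection string is an assumption):
# from kazoo.client import KazooClient
# zk = KazooClient(hosts='127.0.0.1:2181'); zk.start()
# set_or_create(zk, '/service/demo/config', '{"ttl": 30}')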
Exemple #31
0
import json
import sys
import time

from fabric.api import sudo            # Fabric 1.x remote execution helper
from fabric.colors import green, red   # Fabric 1.x terminal colours
from kazoo.client import KazooClient


class SolrCloudManager:
    def __init__(self, zk_host):
        self.__zk = KazooClient(hosts=zk_host)
        self.__zk.start()

    def __del__(self):
        self.__zk.stop()

    def get_cluster_state(self):
        cs_tuple = self.__zk.retry(self.__zk.get, 'clusterstate.json')
        cs = json.loads(cs_tuple[0])
        return cs

    # Check all replicas that contain node_name
    # Return true if ALL nodes are in the active state
    def replicas_are_active(self, node_name):
        cluster_state = self.get_cluster_state()
        active = True
        for cn, cdata in cluster_state.items():
            for sn, sdata in cdata['shards'].items():
                replica_down = False
                node_in_replica = False
                for rn, rdata in sdata['replicas'].items():
                    if rdata['node_name'] == node_name:
                        node_in_replica = True
                    if rdata['state'] != "active":
                        replica_down = True
                if replica_down and node_in_replica:
                    active = False
            if not active:
                break
        return active

    # Wait for all replicas to enter the active state
    def wait_for_replicas(self, node_name, timeout):
        start_time = time.time()
        ra = self.replicas_are_active(node_name)
        while ((start_time + timeout) > time.time()) and (not ra):
            print "Waiting for replication to finish"
            time.sleep(3)
            ra = self.replicas_are_active(node_name)
        return ra

    def node_is_live(self, node_name):
        live_nodes = self.__zk.retry(self.__zk.get_children, 'live_nodes')
        return (node_name in live_nodes)

    def wait_for_live_node(self, node_name, timeout):
        start_time = time.time()
        lv = self.node_is_live(node_name)
        while ((start_time + timeout) > time.time()) and (not lv):
            print "Waiting for live node"
            time.sleep(3)
            lv = self.node_is_live(node_name)
        return lv

    def _remove_live_node(self, node_name):
        print(green('Deleting: live_nodes/%s' % (node_name)))
        self.__zk.retry(self.__zk.delete, 'live_nodes/' + node_name)
        return True

    def _restart_host_solr_service(self, host):
        print(green('Restarting: %s' % (host)))
        result = sudo("restart solr-undertow")
        if result.failed:
            print(red('Failed to restart: %s' % (host)))
            return False
        return True

    def restart_host_solr(self,
                          host,
                          host_port='8983',
                          force=False,
                          ln_timeout=240,
                          rn_timeout=600):
        if host is None:
            return self._return_message(1, 'host is required')

        node_name = host + ':' + host_port + '_solr'
        if (not force) and (not self.node_is_live(node_name)):
            return self._return_message(10, 'Node is not live')

        # Don't restart if any other replicas are down
        if (not force) and (not self.replicas_are_active(node_name)):
            return self._return_message(20, 'Not all replicas are active')

        # LATER Make sure a reindex isn't in progress

        if not self._remove_live_node(node_name):
            return self._return_message(30, 'Error removing live node')

        if not self._restart_host_solr_service(host):
            return self._return_message(40, 'Error restarting solr service')

        if not self.wait_for_live_node(node_name, ln_timeout):
            return self._return_message(50, 'Timeout waiting for live node')

        if not self.wait_for_replicas(node_name, rn_timeout):
            return self._return_message(60, 'Timeout waiting for replicas')

    def _return_message(self, error_code, message):
        print(red({'status': error_code, 'message': message}))
        sys.exit(error_code)
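
# Usage sketch: hostnames and the ZooKeeper connection string below are
# illustrative assumptions, and sudo() requires running under Fabric with the
# target host configured. node_name is derived as '<host>:<port>_solr'.
#
#   manager = SolrCloudManager(zk_host='zk1.example.com:2181')
#   manager.restart_host_solr(host='solr1.example.com', host_port='8983')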
Exemple #32
0
# Assumes a running ZooKeeper and an already started client, e.g.:
#   from kazoo.client import KazooClient
#   zk = KazooClient(hosts='127.0.0.1:2181')
#   zk.start()
try:
    result = None
    result = zk.set("/xy/test", b"some data")
except Exception as e:
    print('exception when zk.set, %s' % e)
else:
    print("zk.set /xy/test result %s" % str(result))


# del
result = zk.delete("/xy/test/node", recursive=True)
print("zk.delete /xy/test/node result %s" % (result))

# action
try:
    result = zk.retry(zk.get, "/xy/test/nodex")
except Exception as e:
    print('exception when zk.retry, %s' % e)
else:
    print("zk.retry /xy/test/nodex result %s" % str(result))

from kazoo.retry import KazooRetry
# ignore_expire=False: a SessionExpiredError is raised instead of being retried
kr = KazooRetry(max_tries=3, ignore_expire=False)
try:
    result = kr(zk.get, "/xy/test/nodex")
except Exception as e:
    print('exception when KazooRetry, %s' % e)
else:
    print("KazooRetry zk.get /xy/test/nodex result %s" % (result))
Exemple #33
0
class _ZookeeperProxy(object):
    def __init__(self, address_provider: AddressListProvider, prefix: str):
        self.address_provider = address_provider
        self.async_counter = WaitingCounter(limit=100)
        self.conn_str = None
        self.client = None
        self.prefix = prefix
        self.hosts_cache = SlowlyUpdatedCache(
            self.address_provider.get_latest_address,
            self._update_hosts,
            30,  # Refresh every 30 seconds
            3 * 60)  # Update only after 180 seconds of stability

    def _update_hosts(self, value):
        hosts, port = value
        if hosts:
            self.conn_str = ','.join(['{}:{}'.format(h, port) for h in hosts]) + self.prefix
            if self.client is None:
                self.client = KazooClient(hosts=self.conn_str,
                                          command_retry={'deadline': 120, 'max_delay': 1, 'max_tries': -1},
                                          connection_retry={'max_delay': 1, 'max_tries': -1})
                self.client.add_listener(self.session_listener)
            else:
                self.client.stop()
                self.client.set_hosts(self.conn_str)
            self.client.start()

    def terminate(self):
        if self.client:
            self.client.stop()

    def session_listener(self, state):
        pass

    def get_conn_str(self):
        return self.conn_str

    def get(self, *params):
        self.hosts_cache.touch()
        return self.client.retry(self.client.get, *params)

    def get_async(self, *params):
        # Exhibitor is not polled here and it's totally fine!
        self.async_counter.increment()
        try:
            i_async = self.client.get_async(*params)
            i_async.rawlink(self._decrement)
            return i_async
        except Exception as e:
            self._decrement()
            raise e

    def _decrement(self, *args, **kwargs):
        self.async_counter.decrement()

    def set(self, *args, **kwargs):
        self.hosts_cache.touch()
        return self.client.retry(self.client.set, *args, **kwargs)

    def create(self, *args, **kwargs):
        self.hosts_cache.touch()
        return self.client.retry(self.client.create, *args, **kwargs)

    def delete(self, *args, **kwargs):
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.delete, *args, **kwargs)
        except NoNodeError:
            pass

    def get_children(self, *args, **kwargs):
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.get_children, *args, **kwargs)
        except NoNodeError:
            return []

    def take_lock(self, *args, **kwargs):
        while True:
            try:
                self.hosts_cache.touch()
                return self.client.Lock(*args, **kwargs)
            except Exception as e:
                _LOG.error('Failed to obtain lock for exhibitor, retrying', exc_info=e)
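
# WaitingCounter, SlowlyUpdatedCache and AddressListProvider are project-
# specific helpers that are not shown here. Below is a minimal sketch of the
# bounded in-flight pattern that get_async() implements, using a plain
# threading.BoundedSemaphore in place of WaitingCounter (an assumption about
# its behaviour, not the real helper):
import threading

from kazoo.client import KazooClient


class BoundedAsyncReader:
    def __init__(self, client: KazooClient, limit: int = 100):
        self._client = client
        self._slots = threading.BoundedSemaphore(limit)  # caps concurrent async gets

    def get_async(self, path):
        self._slots.acquire()  # blocks once `limit` requests are in flight
        try:
            i_async = self._client.get_async(path)
            # free the slot when the request completes, success or failure
            i_async.rawlink(lambda _result: self._slots.release())
            return i_async
        except Exception:
            self._slots.release()
            raise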