Example #1
0
class ZKClient(object):
    """Small kazoo wrapper that advertises this server in ZooKeeper.

    The server is published as the znode ``<server_info_path>/<ip>:<port>``
    whose payload is the Unix timestamp (in seconds) of the last heartbeat.
    """

    def __init__(self,
                 zk_hosts,
                 local_port,
                 local_host=None,
                 server_info_path='/crawlers'):
        """Remember the settings and connect to ZooKeeper right away.

        :param zk_hosts: value passed to ``KazooClient(hosts=...)``.
        :param local_port: port used in the name of this server's znode.
        :param local_host: address used in the znode name; when falsy the
            result of ``get_local_host()`` is used instead.
        :param server_info_path: parent path of the per-server znodes.
        """
        self.zk_hosts = zk_hosts
        self.local_host = local_host or get_local_host()
        self.server_info_path = server_info_path
        self.server_info_znode = '{server_info_path}/{ip}:{port}'.format(
            server_info_path=server_info_path,
            ip=self.local_host,
            port=local_port)
        self.connect_zk()

    def connect_zk(self):
        """Create the kazoo client, attach the state listener and start it."""
        self.zk = KazooClient(hosts=self.zk_hosts)
        self.zk.add_listener(self.state_listener)
        self.zk.start()

    def update_heartbeat(self):
        """Asynchronously store the current timestamp in this server's znode.

        The znode is updated when it already exists; otherwise it is created
        as an ephemeral node (missing parents are created as well).
        """
        # kazoo only accepts byte strings as znode data, so the timestamp is
        # encoded before it is handed to set_async()/create_async().
        ts = str(int(time.time())).encode('utf-8')

        def callback(async_stat):
            # exists_async() resolves to a stat object, or None when the
            # znode is missing.
            if async_stat.get():
                self.zk.set_async(self.server_info_znode, ts)
            else:
                self.zk.create_async(self.server_info_znode,
                                     ts,
                                     ephemeral=True,
                                     makepath=True)

        async_stat = self.zk.exists_async(self.server_info_znode, watch=None)
        async_stat.rawlink(callback)

    def state_listener(self, state):
        """Connection state callback registered with kazoo.

        Nothing is done for LOST and SUSPENDED; every other state triggers a
        heartbeat, which (re)creates the ephemeral znode of this server.
        """
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            return
        if state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            return
        self.update_heartbeat()

    def add_watcher(self, request_handler):
        """Install a data watch on ``ZOO_CONFIG_PROXY_PATH``.

        Every time the watch fires, the znode data is passed to
        ``request_handler.config_proxy_via_zookeeper`` through
        ``run_in_thread``.
        """
        @self.zk.DataWatch(ZOO_CONFIG_PROXY_PATH)
        def proxy_change(data, stat):
            run_in_thread(request_handler.config_proxy_via_zookeeper, data)

    def close(self):
        """Stop the kazoo client."""
        self.zk.stop()
Example #2
0
class ZooKeeper(AbstractDCS):
    """DCS implementation that keeps the cluster state in ZooKeeper via kazoo."""

    def __init__(self, config):
        """Build the kazoo client from `config`, register the listener and connect.

        Keys read from `config`: `hosts` (list or comma separated string, optional),
        `retry_timeout` and `ttl`."""
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry={'max_delay': 1, 'max_tries': -1},
                                   command_retry={'deadline': config['retry_timeout'], 'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        # Wrap kazoo's private connect routine so that `_kazoo_connect` can
        # substitute its own read_timeout (see the docstring of that method).
        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):

        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
        # Only the first element of the original result is replaced; the value
        # is in milliseconds and never lower than 2 seconds.
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        """Kazoo state listener: request a cluster reload when the session is SUSPENDED or LOST."""
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        """Watch callback: mark the cached cluster as stale and set `self.event`.

        The `event` argument is not used."""
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        """Apply new values of `retry_timeout`, `loop_wait` and `ttl` taken from `config`."""
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one.

        :returns: `True` if session_timeout has been changed (`restart()` has been
            called), `False` otherwise."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True
        return False

    def set_retry_timeout(self, retry_timeout):
        """Set the deadline of the kazoo command retry object."""
        self._client._retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        """Read a znode.

        :returns: tuple `(value decoded as utf-8, znode stat)` or `None` if the node doesn't exist."""
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        """Build a `Member` from the node name, its value and its znode stat."""
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        """Return the children of `key`, or an empty list if the node doesn't exist."""
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        """Read all member nodes below `members_path`; a child watch is left on that path."""
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            # The node may disappear between listing and reading it.
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        """Read the whole cluster state from ZooKeeper and store it in `self._cluster`.

        Clears `_fetch_cluster` and `self.event` first; `_fetch_cluster` is set again
        if the top-level node has no children or no member node matches the leader."""
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            # The leader key carries our name but belongs to a different session
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                # Placeholder (index == -1) used when the leader has no member node
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, config, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        """Reload the cluster if it was never loaded or has been marked as stale.

        :raises ZooKeeperError: if the cluster could not be read."""
        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')

    def _create(self, path, value, **kwargs):
        """Create `path` with the utf-8 encoded `value`; `kwargs` are passed to kazoo `create`.

        :returns: `True` on success, `False` if anything went wrong."""
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except Exception:
            return False

    def attempt_to_acquire_leader(self, permanent=False):
        """Try to create the leader node (ephemeral unless `permanent`); returns `True` on success."""
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        """Write the failover key, optionally only if its version equals `index`.

        If the node doesn't exist an empty `value` counts as success, otherwise the node
        is created provided that no `index` was requested."""
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (index is None and self._create(self.failover_path, value))
        except Exception:
            logger.exception('set_failover_value')
            return False

    def set_config_value(self, value, index=None):
        """Write the config key, optionally only if its version equals `index`.

        If the node doesn't exist it is created provided that no `index` was requested."""
        try:
            self._client.retry(self._client.set, self.config_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return index is None and self._create(self.config_path, value)
        except Exception:
            logger.exception('set_config_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        """Create the initialize key with `sysid`, or overwrite it when `create_new` is false.

        Note that only the create branch returns a boolean; the overwrite branch
        returns whatever kazoo `set` returns and lets its exceptions propagate."""
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path,  sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None, permanent=False):
        """Create or update the member node of this instance with `data`.

        The `ttl` argument is not used by this implementation.

        :returns: `True` if the node holds `data` afterwards, `False` otherwise."""
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        data = data.encode('utf-8')
        # The member node exists but was created by a different session: remove it
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if data == self._my_member_data:
                return True
        else:
            try:
                self._client.create_async(self.member_path, data, makepath=True, ephemeral=not permanent).get(timeout=1)
                self._my_member_data = data
                return True
            except NodeExistsError:
                # Somebody created the node in the meantime, overwrite it below
                pass
            except Exception:
                logger.exception('touch_member')
                return False
        try:
            self._client.set_async(self.member_path, data).get(timeout=1)
            self._my_member_data = data
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        """Same as `attempt_to_acquire_leader()` with default arguments."""
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        """Store `last_operation` in the leader optime node, creating the node if necessary.

        Failures are logged and not reported to the caller."""
        last_operation = last_operation.encode('utf-8')
        # NOTE(review): `_inner_load_cluster` stores an int in `_last_leader_operation`
        # while this method stores the encoded value, so after a cluster reload the
        # comparison below does not skip the write - confirm whether that is intended.
        if last_operation != self._last_leader_operation:
            try:
                self._client.set_async(self.leader_optime_path, last_operation).get(timeout=1)
                self._last_leader_operation = last_operation
            except NoNodeError:
                try:
                    self._client.create_async(self.leader_optime_path, last_operation, makepath=True).get(timeout=1)
                    self._last_leader_operation = last_operation
                except Exception:
                    logger.exception('Failed to create %s', self.leader_optime_path)
            except Exception:
                logger.exception('Failed to update %s', self.leader_optime_path)

    def update_leader(self):
        """Nothing to do here, always returns `True`."""
        return True

    def delete_leader(self):
        """Restart the kazoo client and forget the cached member data; returns `True`."""
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        """Delete the initialize key if it exists, using the version that was just read."""
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        """Delete the initialize key with retries; failures are only logged."""
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        """Recursively delete the top-level node of the cluster; a missing node counts as success."""
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        """Call the parent `watch()`; a true result marks the cached cluster as stale.

        :returns: the current value of `_fetch_cluster`."""
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
Example #3
0
class ZookeeperServiceRegistry(BaseServiceRegistry):
    """Service registry that stores service instances as znodes.

    Instances live below ``<chroot>/services/<service_type>/<identity>`` and
    hold a JSON document describing the instance.
    """

    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        """Create the (not yet started) kazoo client.

        :param hosts: value passed to ``KazooClient(hosts=...)``.
        :param chroot: path prefix of every znode used by the registry.
        """
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        # Number of on_start() calls that were not yet matched by on_stop()
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        """Alternate constructor reading ``hosts`` and ``chroot`` from `config`."""
        return cls(hosts=config.get('hosts', DEFAULT_HOSTS),
                   chroot=config.get('chroot', DEFAULT_CHROOT),
                   **kwargs)

    def on_start(self, timeout=10):
        """Connect to ZooKeeper on the first call; later calls only count.

        :param timeout: seconds to wait for the connection.
        :raises RuntimeError: if the client is not connected after `timeout`.
        """
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            # Roll back the counter and halt the pending connection attempt,
            # otherwise every later on_start() would return early without
            # ever connecting.
            self.start_count -= 1
            self.client.stop()
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        """Stop the client once every on_start() was matched by an on_stop()."""
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        """Kazoo state listener; only logs the new state."""
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        """Watch callback for the children of a service type node.

        Repeats the lookup when the set of children changed. Errors are
        logged and never propagate to the caller.
        """
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        """Watch callback for a single instance node.

        Removes the instance from `service` when its znode was deleted.
        Errors are logged and never propagate to the caller.
        """
        try:
            # The identity is the last component of the znode path
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        """Read one instance node and return its JSON content as a dict.

        Keys and values are converted with ``str``. A watch calling
        `on_service_watch` is left on the node.
        """
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(path,
                                       watch=functools.partial(
                                           self.on_service_watch, service))
        value, _stat = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        """Return the names of the children of ``<chroot>/services``.

        The `container` argument is not used.
        """
        result = self.client.get_children_async(path='%s/services' %
                                                self.chroot, )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        """Synchronize `service` with the instances registered in ZooKeeper.

        Every registered instance is passed to ``service.update``; identities
        known to `service` that are no longer registered are removed. A child
        watch calling `on_service_type_watch` is left on the service type node.
        The `container` and `watch` arguments are currently not used.

        :param timeout: seconds to wait for the list of children.
        :returns: `service`.
        :raises LookupFailure: if the service type node doesn't exist.
        """
        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None,
                                "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)
        # Identities still in this set after the loop are not registered anymore
        identities = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            identities.discard(identity)
        for identity in identities:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        """Return the znode path of one service instance."""
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        """Create the ephemeral znode that announces `container`.

        The node holds the endpoint, identity and log endpoint of the
        container as JSON.

        :param timeout: seconds to wait for the result of the create call.
        """
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(path,
                                          value.encode('utf-8'),
                                          ephemeral=True,
                                          makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        """Delete the znode that announces `container`.

        :param timeout: seconds to wait for the result of the delete call.
        :raises RegistrationFailure: if the delete call reports an error.
        """
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        try:
            # Wait for the real outcome. Setting an exception on the pending
            # result (as was done before) pre-empts the answer of the server.
            result.get(timeout=timeout)
        except Exception:
            raise RegistrationFailure()
Example #4
0
class ZooKeeper(AbstractDCS):
    """DCS implementation that keeps the cluster state in ZooKeeper via kazoo."""

    def __init__(self, config):
        """Build the kazoo client from `config`, register the listener and connect.

        Keys read from `config`: `hosts` (list or comma separated string, optional),
        `retry_timeout`, `ttl` and the optional TLS settings listed in `mapping` below."""
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        # config key -> name of the corresponding KazooClient keyword argument
        mapping = {
            'use_ssl': 'use_ssl',
            'verify': 'verify_certs',
            'cacert': 'ca',
            'cert': 'certfile',
            'key': 'keyfile',
            'key_password': 'keyfile_password'
        }
        kwargs = {v: config[k] for k, v in mapping.items() if k in config}

        self._client = KazooClient(
            hosts,
            handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
            timeout=config['ttl'],
            connection_retry=KazooRetry(max_delay=1,
                                        max_tries=-1,
                                        sleep_func=time.sleep),
            command_retry=KazooRetry(deadline=config['retry_timeout'],
                                     max_delay=1,
                                     max_tries=-1,
                                     sleep_func=time.sleep),
            **kwargs)
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True
        self._fetch_optime = True

        # Wrap kazoo's private connect routine so that `_kazoo_connect` can
        # substitute its own read_timeout (see the docstring of that method).
        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, *args):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(*args)
        # Only the first element of the original result is replaced; the value
        # is in milliseconds and never lower than 2 seconds.
        return max(self.loop_wait - 2, 2) * 1000, ret[1]

    def session_listener(self, state):
        """Kazoo state listener: request a cluster reload when the session is SUSPENDED or LOST."""
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def optime_watcher(self, event):
        """Watch callback: request a re-read of the leader optime and set `self.event`.

        The `event` argument is not used."""
        self._fetch_optime = True
        self.event.set()

    def cluster_watcher(self, event):
        """Watch callback: mark the cached cluster as stale, then do what `optime_watcher` does."""
        self._fetch_cluster = True
        self.optime_watcher(event)

    def reload_config(self, config):
        """Apply new values of `retry_timeout`, `loop_wait` and `ttl` taken from `config`."""
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `True`
        if session_timeout has been changed (`restart()` has been called) and `None`
        otherwise."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        """Session timeout currently stored in the kazoo client."""
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        """Set the deadline of the kazoo command retry object.

        The retry object is `self._client.retry` if that is a `KazooRetry` instance and
        the private `self._client._retry` otherwise."""
        retry = self._client.retry if isinstance(
            self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        """Read a znode.

        :returns: tuple `(value decoded as utf-8, znode stat)` or `None` if the node doesn't exist."""
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    def get_leader_optime(self, leader):
        """Read the leader optime node and clear the `_fetch_optime` flag.

        A watch is set on the node unless `leader` carries the name of this instance.

        :returns: the value of the node as an int, or 0 if the node is missing or empty."""
        watch = self.optime_watcher if not leader or leader.name != self._name else None
        optime = self.get_node(self.leader_optime_path, watch)
        self._fetch_optime = False
        return optime and int(optime[0]) or 0

    @staticmethod
    def member(name, value, znode):
        """Build a `Member` from the node name, its value and its znode stat."""
        return Member.from_node(znode.version, name, znode.ephemeralOwner,
                                value)

    def get_children(self, key, watch=None):
        """Return the children of `key`, or an empty list if the node doesn't exist."""
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        """Read all member nodes below `members_path`.

        A child watch is left on `members_path`; a data watch is set only on the
        members whose names are contained in `sync_standby`."""
        members = []
        for member in self.get_children(self.members_path,
                                        self.cluster_watcher):
            watch = member in sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            # The node may disappear between listing and reading it.
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        """Read the whole cluster state from ZooKeeper.

        Clears `_fetch_cluster` and `self.event` first; `_fetch_cluster` is set again
        if the top-level node has no children or no member node matches the leader.

        :returns: a new `Cluster` object."""
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(
            self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path)
                      or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(
            self.config_path,
            watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version,
                                                    config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(
            self.history_path,
            watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(
            history[1].mzxid, history[0])

        # get synchronization state
        sync = self.get_node(
            self.sync_path,
            watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.members or []
        members = self.load_members(
            sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(
            self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            # The leader key carries our name but belongs to a different session
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info(
                    'I am leader but not owner of the session. Removing leader node'
                )
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                # Placeholder (index == -1) used when the leader has no member node
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]]
                          or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner,
                                member)
                self._fetch_cluster = member.index == -1

        # get last leader operation (`False` when the optime node is not listed)
        last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(
            leader)

        # failover key
        failover = self.get_node(
            self.failover_path,
            watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version,
                                                   failover[0])

        return Cluster(initialize, config, leader, last_leader_operation,
                       members, failover, sync, history)

    def _load_cluster(self):
        """Return an up to date cluster object.

        The cluster is reloaded completely if it is missing or marked as stale. Otherwise
        only the leader optime is refreshed when necessary and this instance is not the
        leader; a failure of that refresh is ignored and the cached cluster is returned.

        :raises ZooKeeperError: if the complete reload failed."""
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')
        # Optime ZNode was updated or doesn't exist and we are not leader
        elif (self._fetch_optime and not self._fetch_cluster or not cluster.last_leader_operation) and\
                not (cluster.leader and cluster.leader.name == self._name):
            try:
                optime = self.get_leader_optime(cluster.leader)
                cluster = Cluster(cluster.initialize, cluster.config,
                                  cluster.leader, optime, cluster.members,
                                  cluster.failover, cluster.sync,
                                  cluster.history)
            except Exception:
                pass
        return cluster

    def _bypass_caches(self):
        """Force a complete reload on the next `_load_cluster()` call."""
        self._fetch_cluster = True

    def _create(self, path, value, retry=False, ephemeral=False):
        """Create `path` (and missing parents) holding `value`.

        With `retry` the call goes through the kazoo retry object, otherwise a single
        asynchronous call is awaited for at most one second.

        :returns: `True` on success, `False` (after logging) on any failure."""
        try:
            if retry:
                self._client.retry(self._client.create,
                                   path,
                                   value,
                                   makepath=True,
                                   ephemeral=ephemeral)
            else:
                self._client.create_async(path,
                                          value,
                                          makepath=True,
                                          ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        """Try to create the leader node (ephemeral unless `permanent`); returns `True` on success."""
        ret = self._create(self.leader_path,
                           self._name.encode('utf-8'),
                           retry=True,
                           ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self,
                       key,
                       value,
                       index=None,
                       retry=False,
                       do_not_create_empty=False):
        """Write the utf-8 encoded `value` to `key`, creating the node if it is missing.

        :param index: expected version of the node; `None` (or 0) skips the version check.
        :param retry: go through the kazoo retry object instead of a single call.
        :param do_not_create_empty: treat a missing node as success if `value` is empty.
        :returns: `True` on success. If the node is missing and an `index` was given the
            result is `False`; any other failure is logged and also yields `False`."""
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set,
                                   key,
                                   value,
                                   version=index or -1)
            else:
                self._client.set_async(key, value, version=index
                                       or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        """Write the failover key (single attempt); see `_set_or_create`."""
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        """Write the config key (with retries); see `_set_or_create`."""
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        """Create the initialize key with `sysid`, or overwrite it when `create_new` is false.

        Note that only the create branch returns a boolean; the overwrite branch
        returns whatever kazoo `set` returns and lets its exceptions propagate."""
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        """Create or update the member node of this instance.

        :param data: dict that is stored as compact JSON.
        :param permanent: create a regular instead of an ephemeral node.
        :returns: `True` if the node holds `data` afterwards, `False` otherwise."""
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name,
                                                fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        # The existing node is deleted (and created again below) if it belongs to a
        # different session or if `tags`, `version` or `checkpoint_after_promote` differ.
        if member and (self._client.client_id is not None
                       and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}),
                                         data.get('tags', {})) and
                            member.data.get('version') == data.get('version')
                            and member.data.get('checkpoint_after_promote')
                            == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(
                    self.member_path,
                    encoded_data,
                    makepath=True,
                    ephemeral=not permanent).get(timeout=1)
                return True
            except NodeExistsError:
                # Somebody created the node in the meantime, overwrite it below
                pass
            except Exception:
                logger.exception('touch_member')
                return False
        try:
            self._client.set_async(self.member_path,
                                   encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        """Same as `attempt_to_acquire_leader()` with default arguments."""
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        """Write the leader optime key (single attempt); see `_set_or_create`."""
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        """Nothing to do here, always returns `True`."""
        return True

    def _delete_leader(self):
        """Restart the kazoo client; returns `True`."""
        self._client.restart()
        return True

    def _cancel_initialization(self):
        """Delete the initialize key if it exists, using the version that was just read."""
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        """Delete the initialize key with retries; failures are only logged."""
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        """Recursively delete the top-level node of the cluster; a missing node counts as success."""
        try:
            return self._client.retry(self._client.delete,
                                      self.client_path(''),
                                      recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        """Write the history key (single attempt); see `_set_or_create`."""
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        """Write the sync key (with retries); an empty `value` is not written to a missing node."""
        return self._set_or_create(self.sync_path,
                                   value,
                                   index,
                                   retry=True,
                                   do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        """Reset the sync key by writing `"{}"` to it."""
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        """Call the parent `watch()`; a true result marks the cached cluster as stale unless
        an optime refresh is pending.

        :returns: the current value of `_fetch_cluster`."""
        if super(ZooKeeper, self).watch(leader_index,
                                        timeout) and not self._fetch_optime:
            self._fetch_cluster = True
        return self._fetch_cluster
Example #5
0
class ZookeeperServiceRegistry(BaseServiceRegistry):
    """Service registry that stores service instances in ZooKeeper.

    Every registered instance is an ephemeral znode at
    ``<chroot>/services/<service_type>/<identity>`` whose payload is a
    UTF-8 encoded JSON object.
    """

    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        # Number of outstanding on_start() calls; the client is started on
        # the first one and stopped when the count drops back to zero.
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        """Build a registry from a mapping with optional 'hosts'/'chroot'."""
        return cls(
            hosts=config.get('hosts', DEFAULT_HOSTS),
            chroot=config.get('chroot', DEFAULT_CHROOT),
            **kwargs
        )

    def on_start(self, timeout=10):
        """Connect to ZooKeeper on the first call; later calls only count.

        Raises RuntimeError when the client is not connected after waiting
        up to *timeout* seconds.
        """
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        """Stop the client once every on_start() has been balanced."""
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        """Log every connection state change reported by kazoo."""
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        """Re-run lookup() when the children of a service type change."""
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        """Drop an instance from *service* when its znode is deleted."""
        try:
            # The last path component is the instance identity.
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        """Read one instance znode (setting a watch) and decode its JSON.

        Returns a dict with both keys and values converted via str().
        """
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(
            path, watch=functools.partial(self.on_service_watch, service))
        value, _ = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        """Return the names of all children of ``<chroot>/services``."""
        result = self.client.get_children_async(
            path='%s/services' % self.chroot,
        )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        """Synchronise *service* with the instances stored in ZooKeeper.

        Instances found in ZooKeeper are added/updated on *service*;
        identities that *service* knows but ZooKeeper no longer lists are
        removed. A children watch is set so later changes trigger another
        lookup. Raises LookupFailure when the service type node is missing.
        Returns *service*.
        """
        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None, "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)
        # Start from everything currently known; whatever is left after the
        # loop is stale and gets removed.
        stale_identities = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            stale_identities.discard(identity)
        for identity in stale_identities:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        """Return the znode path of one service instance."""
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        """Create the ephemeral znode describing *container*.

        Blocks up to *timeout* seconds; errors from the create request
        propagate unchanged.
        """
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(
            path,
            value.encode('utf-8'),
            ephemeral=True, makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        """Delete the znode of *container*.

        Raises RegistrationFailure when the delete fails or does not
        complete within *timeout* seconds.
        """
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        # Do NOT call result.set_exception() up front: that marks the async
        # result as failed immediately, so get() would raise even when the
        # delete succeeds. Translate real failures instead.
        try:
            result.get(timeout=timeout)
        except Exception:
            logger.exception('failed to unregister %s', path)
            raise RegistrationFailure()
Example #6
0
    # Account for the one master and one slave worker started below.
    MASTER_COUNT += 1
    SLAVE_COUNT += 1

    # Start the first worker container; it is tracked in MASTER_LIST, so the
    # trailing "1" passed to worker.py presumably selects master mode
    # -- TODO confirm against worker.py.
    WORKER_COUNT += 1
    container = client.containers.run(
        "workers:latest",
        detach=True,
        name="worker_container" + str(WORKER_COUNT),
        network="orch-network",
        command=["sh", "-c", "service mongodb start; python3 worker.py 1"])
    # Look up the container's PID through the second client (p_client).
    pid = p_client.inspect_container(container.name)['State']['Pid']
    MASTER_LIST.append(container)
    message = ("running " + str(pid)).encode()
    # Publish the master under /worker/master plus a child named by its PID
    # (blocking creates).
    zk.create("/worker/master", message, makepath=True)
    zk.create("/worker/master/" + str(pid), b"running")

    # Start the second worker container; it is tracked in SLAVE_LIST
    # (worker.py argument "0").
    WORKER_COUNT += 1
    container = client.containers.run(
        "workers:latest",
        detach=True,
        name="worker_container" + str(WORKER_COUNT),
        network="orch-network",
        command=["sh", "-c", "service mongodb start; python3 worker.py 0"])
    pid = p_client.inspect_container(container.name)['State']['Pid']
    SLAVE_LIST.append(container)
    message = ("running " + str(pid)).encode()
    # Unlike the master, the slave znodes are created asynchronously and the
    # results are not checked, so a failed create goes unnoticed here.
    zk.create_async("/worker/slave", message, makepath=True)
    zk.create_async("/worker/slave/" + str(pid), b"running")

    # Serve the application on all interfaces.
    app.run(host='0.0.0.0', debug=False)
Example #7
0
class Zookeeper:
    """Coroutine-style convenience wrapper around a gevent-backed KazooClient.

    Every coroutine method issues one request and delivers the result through
    ``raise Return(...)``; errors from ZooKeeper propagate unchanged to the
    caller.
    """

    def __init__(self, hosts):
        """Create the client and wait up to 30 seconds for a connection.

        When no connection is established in time the client is stopped;
        no exception is raised for that case.
        """
        self.zk = KazooClient(hosts=hosts,
                              handler=SequentialGeventHandler(),
                              logger=logger)
        # start_async() returns immediately with an event to wait on.
        event = self.zk.start_async()

        # Wait for 30 seconds and see if we're connected
        event.wait(timeout=30)
        if not self.zk.connected:
            # Not connected, stop trying to connect
            self.zk.stop()

    @coroutine
    def get_children(self, node):
        """Return the child names of *node*."""
        children = self.zk.get_children_async(node)
        raise Return(children.get())

    @coroutine
    def get_node(self, node):
        """Return the result of reading *node* (data and stat)."""
        data = self.zk.get_async(node)
        raise Return(data.get())

    @coroutine
    def check_path_exist(self, path):
        """Return True when *path* exists, False otherwise."""
        result = self.zk.exists(path)
        raise Return(bool(result))

    @coroutine
    def create_path(self, path):
        """Ensure *path* exists, creating it if necessary."""
        result = self.zk.ensure_path_async(path)
        raise Return(result.get())

    @coroutine
    def create_node(self, path, value):
        """Create an ephemeral node at *path* holding *value*."""
        result = self.zk.create_async(path=path,
                                      value=value,
                                      acl=None,
                                      ephemeral=True)
        raise Return(result.get())

    @coroutine
    def update_node(self, path, value, version=-1):
        """Set the data of *path* to *value*, passing *version* through."""
        result = self.zk.set_async(path, value, version)
        raise Return(result.get())

    @coroutine
    def del_node(self, node):
        """Delete *node*."""
        node_info = self.zk.delete_async(node)
        raise Return(node_info.get())

    def close(self):
        """Stop the underlying client."""
        self.zk.stop()
Example #8
0
class Zooconf:
    """Publishes this service in ZooKeeper and mirrors the service tree.

    Connection details, the node location and the published payload are all
    read from the ``settings`` module.
    """

    # KazooClient created by __zooConnect().
    connection = None
    # "<host>:<port>/" strings filled in by initZkTree().
    webService = None
    applicationService = None
    # Lists of dicts describing known services, rebuilt by initZkTree().
    storageServicesList = None
    authenticationServiceList = None
    # Parsed result of the last readTree() call.
    status = None

    def initialize(self):
        """Connect, publish this node, install the watches and start the heartbeat."""
        self.__zooConnect()
        self.__publishService()
        self.__initAuthenticationServiceWatches()
        self.__initStorageServiceWatches()
        self.heartbeat()

    def __zooConnect(self):
        """Create a new client, start it and authenticate with digest credentials.

        NOTE(review): a previously created client is replaced without being
        stopped -- confirm whether that is intended on reconnect.
        """
        print("Connecting to ZooKeeper")
        self.connection = KazooClient(hosts=settings.ZOOKEEPER_HOST)
        self.connection.start()
        digest_auth = "%s:%s" % (settings.ZOOKEEPER_USER,
                                 settings.ZOOKEEPER_PASSWORD)
        self.connection.add_auth("digest", digest_auth)

    def __publishService(self):
        """Write this service's JSON description to its node and register it
        in the parent node's CHILDREN list."""
        # NOTE(review): acl is built but never passed to create/set below.
        acl = make_digest_acl(settings.ZOOKEEPER_USER,
                              settings.ZOOKEEPER_PASSWORD,
                              all=True)
        dataJsonDict = {
            'SERVER_HOSTNAME': settings.SERVER_HOSTNAME,
            'SERVER_PORT': settings.SERVER_PORT,
            'SHARED_KEY_BASE_64': settings.SHARED_KEY_BASE_64,
            'CHILDREN': []
        }
        # Authentication services additionally advertise their auth system.
        if settings.ZOOKEEPER_PATH_TO_NODE == 'Auth/':
            dataJsonDict['AUTH_SYSTEM'] = settings.AUTH_SYSTEM
        # Update the node when it already exists, otherwise create it.
        if self.connection.exists(path=settings.ZOOKEEPER_ROOT +
                                  settings.ZOOKEEPER_PATH_TO_NODE +
                                  settings.ZOOKEEPER_NODE_ID):
            self.connection.set(
                path=settings.ZOOKEEPER_ROOT +
                settings.ZOOKEEPER_PATH_TO_NODE + settings.ZOOKEEPER_NODE_ID,
                value=json.JSONEncoder().encode(dataJsonDict).encode())
        else:
            # The create is asynchronous and its result is not checked.
            self.connection.create_async(
                path=settings.ZOOKEEPER_ROOT +
                settings.ZOOKEEPER_PATH_TO_NODE + settings.ZOOKEEPER_NODE_ID,
                value=json.JSONEncoder().encode(dataJsonDict).encode(),
                ephemeral=settings.ZOOKEEPER_NODE_EPHIMERAL)
        # When the node lives below a parent, add its id to the parent's
        # CHILDREN list (read-modify-write of the parent's JSON payload).
        if settings.ZOOKEEPER_PATH_TO_NODE != '':
            data, stat = self.connection.get(settings.ZOOKEEPER_ROOT +
                                             settings.ZOOKEEPER_PATH_TO_NODE)
            dataJsonDict = json.loads(data.decode("utf-8"))
            if settings.ZOOKEEPER_NODE_ID not in dataJsonDict['CHILDREN']:
                dataJsonDict['CHILDREN'].append(settings.ZOOKEEPER_NODE_ID)
            self.connection.set(
                path=settings.ZOOKEEPER_ROOT + settings.ZOOKEEPER_PATH_TO_NODE,
                value=json.JSONEncoder().encode(dataJsonDict).encode())

    def __initStorageServiceWatches(self):
        """Install a children watch on the StorageServices node."""
        self.storageServicesList = []

        @self.connection.ChildrenWatch(settings.ZOOKEEPER_ROOT +
                                       "StorageServices")
        def watch_children(children):
            # Refresh (and print) the whole tree on every change.
            print(self.readTree())
            if settings.ZOOKEEPER_PATH_TO_NODE == "StorageServices/":
                node = {
                    'SERVER_HOSTNAME': settings.SERVER_HOSTNAME,
                    'SERVER_PORT': settings.SERVER_PORT,
                    'CHILDREN': []
                }
                # NOTE(review): initZkTree() stores dicts with 'name'/'url'
                # keys in this list, so this membership test looks like it
                # can never match -- confirm.
                if node not in self.storageServicesList:
                    self.__publishService()

    def __initAuthenticationServiceWatches(self):
        """Install a children watch on the Auth node."""
        self.authenticationServiceList = []

        @self.connection.ChildrenWatch(settings.ZOOKEEPER_ROOT + "Auth")
        def watch_children(children):
            # Refresh (and print) the whole tree on every change.
            print(self.readTree())
            if settings.ZOOKEEPER_PATH_TO_NODE == "Auth/":
                node = {
                    'SERVER_HOSTNAME': settings.SERVER_HOSTNAME,
                    'SERVER_PORT': settings.SERVER_PORT,
                    'SHARED_KEY_BASE_64': settings.SHARED_KEY_BASE_64,
                    'CHILDREN': [],
                    'AUTH_SYSTEM': settings.AUTH_SYSTEM
                }
                # NOTE(review): same shape mismatch as in the storage watch
                # above -- confirm.
                if node not in self.authenticationServiceList:
                    self.__publishService()

    def getAvailableFs(self):
        """Return the current list of storage services."""
        return self.storageServicesList

    def getZooConnection(self):
        """Return the current client."""
        return self.connection

    def readTree(self):
        """Read root children and grandchildren into ``self.status``.

        The node payloads are concatenated into one JSON object string keyed
        by node name, parsed, stored in ``self.status`` and returned. As a
        side effect each child's CHILDREN list is rewritten with its current
        grandchildren, and initZkTree() is called.
        """
        result = "{"
        rootChildren = self.connection.get_children(settings.ZOOKEEPER_ROOT)
        for child in rootChildren:
            data, stat = self.connection.get(settings.ZOOKEEPER_ROOT + child)
            result += '"' + child + '": ' + data.decode("utf-8") + ","
            try:
                grandchildren = self.connection.get_children(
                    settings.ZOOKEEPER_ROOT + child)
                for grandchild in grandchildren:
                    data, stat = self.connection.get(settings.ZOOKEEPER_ROOT +
                                                     child + '/' + grandchild)
                    result += '"' + grandchild + '": ' + data.decode(
                        "utf-8") + ","
                data, stat = self.connection.get(settings.ZOOKEEPER_ROOT +
                                                 child + '/')
                dataJsonDict = json.loads(data.decode("utf-8"))
                dataJsonDict['CHILDREN'] = grandchildren
                self.connection.set(
                    path=settings.ZOOKEEPER_ROOT + child,
                    value=json.JSONEncoder().encode(dataJsonDict).encode())
            except Exception:
                # Best effort: any error while handling this child's subtree
                # is ignored silently.
                pass
        # Replace the trailing comma with the closing brace.
        # NOTE(review): with no root children this yields "}" and
        # json.loads() raises -- confirm whether that can happen.
        result = result[:-1] + '}'
        self.status = json.loads(result)
        self.initZkTree()
        return self.status

    def getStatus(self):
        """Return readTree(); on any error reconnect and try again.

        NOTE(review): the retry is unbounded recursion, so a persistent
        failure reconnects repeatedly -- confirm whether a limit is needed.
        """
        try:
            return self.readTree()
        except Exception:
            self.__zooConnect()
            return self.getStatus()

    def getNodeData(self, node):
        """Return ``self.status[node]``, or an empty dict when unavailable."""
        try:
            return self.status[node]
        except Exception:
            return {}

    def initZkTree(self):
        """Rebuild the service URLs and service lists from ``self.status``.

        NOTE(review): getNodeData() returns {} for a missing node, in which
        case the key lookups on WebService, ApplicationService, Auth and
        StorageServices below raise KeyError -- confirm those nodes always
        exist.
        """
        serviceData = self.getNodeData('WebService')
        self.webService = serviceData['SERVER_HOSTNAME'] + ':' + serviceData[
            'SERVER_PORT'] + '/'

        serviceData = self.getNodeData('ApplicationService')
        self.applicationService = serviceData[
            'SERVER_HOSTNAME'] + ':' + serviceData['SERVER_PORT'] + '/'

        serviceData = self.getNodeData('Auth')
        self.authenticationServiceList = []
        for authService in serviceData['CHILDREN']:
            authServiceData = self.getNodeData(authService)
            # Children without data in the status snapshot are skipped.
            if authServiceData != {}:
                # The port part is omitted when SERVER_PORT is empty.
                baseUrl = authServiceData['SERVER_HOSTNAME'] + (
                    (':' + authServiceData['SERVER_PORT'])
                    if authServiceData['SERVER_PORT'] != '' else '') + '/'
                loginUrl = baseUrl + 'login?system=' + authServiceData[
                    'AUTH_SYSTEM'] + '&callback='
                registerUrl = baseUrl + 'register?system=' + authServiceData[
                    'AUTH_SYSTEM'] + '&callback='
                authServiceDict = {
                    'name': authService,
                    'url': baseUrl,
                    'loginUrl': loginUrl,
                    'registerUrl': registerUrl,
                    'sharedKey': authServiceData['SHARED_KEY_BASE_64'],
                    'system': authServiceData['AUTH_SYSTEM']
                }
                self.authenticationServiceList.append(authServiceDict)

        serviceData = self.getNodeData('StorageServices')
        self.storageServicesList = []
        for storageService in serviceData['CHILDREN']:
            storageServiceData = self.getNodeData(storageService)
            # Children without data in the status snapshot are skipped.
            if storageServiceData != {}:
                storageServiceDict = {
                    'name':
                    storageService,
                    'url':
                    storageServiceData['SERVER_HOSTNAME'] + ':' +
                    storageServiceData['SERVER_PORT'] + '/'
                }
                self.storageServicesList.append(storageServiceDict)

    def heartbeat(self):
        """Print the current status and schedule the next run in 300 seconds."""
        # The timer is armed first, so the schedule continues even when the
        # status refresh below raises.
        threading.Timer(300.0, self.heartbeat).start()
        print("Heartbeat")
        print(str(self.getStatus()))
class ZookeeperLeaderElector:
    """Registers a broker node in ZooKeeper and takes part in leader election.

    The connection URL is read from ``config/zookeeper.config`` located three
    directory levels above this file.
    """

    zk = None
    default_node_path = None
    zookeeper_connection_url = None
    config_file_dir = None

    def __init__(self):
        # Parent znode under which every broker registers itself.
        self.default_node_path = "/cluster"

        # Resolve the config file, then read the connection URL from it.
        self._get_config_file_addr()
        self._init_config_props()

        # The client is only constructed here; startup() connects it.
        self.zk = KazooClient(self.zookeeper_connection_url)

    def _get_config_file_addr(self):
        """Store the path of zookeeper.config in ``config_file_dir``."""
        here = path.dirname(path.realpath(__file__))
        project_root = path.dirname(path.dirname(here))
        self.config_file_dir = path.join(project_root, 'config',
                                         'zookeeper.config')

    def _init_config_props(self):
        """Load ``[connect] url`` from the config file."""
        parser = ConfigParser()
        parser.read(self.config_file_dir)
        connect_section = parser["connect"]
        self.zookeeper_connection_url = connect_section["url"]

    def startup(self, server_id, host_ip, broker_server_start):
        """Connect, register this broker and run the election.

        An ephemeral node ``node<server_id>`` holding *host_ip* is created
        under the default path, then ``broker_server_start`` is handed to
        the election. The client is always stopped and closed afterwards.
        """
        self.zk.start()

        try:
            root = self.default_node_path
            self.zk.ensure_path(root)

            # Register this broker; the create is fire-and-forget.
            node_name = "node" + server_id
            self.zk.create_async(root + '/' + node_name,
                                 bytes(host_ip, 'utf-8'),
                                 ephemeral=True)

            print("[SETUP/ZK] Elect for leadership ...")
            self.zk.Election(self.default_node_path,
                             node_name).run(broker_server_start)
        except KeyboardInterrupt:
            self.exit()
        finally:
            # Always stop the zk instance and disconnect.
            self.zk.stop()
            self.zk.close()

    def exit(self):
        """Announce the disconnect and raise KeyboardInterrupt."""
        print("[EXIT] Disconnect from zookeeper server.")
        raise KeyboardInterrupt

    def ready(self):
        """Return True when fully configured; otherwise exit the process."""
        if not self.config_file_dir:
            sys.exit("[ERR] Doesn't find the config file.")
        if not self.zookeeper_connection_url:
            sys.exit("[ERR] Zookeeper server url is EMPTY.")
        if not self.zk:
            sys.exit("[ERR] Zookeeper instance instantiation FAILED.")
        return True
Example #10
0
class ZooHandler(object):
    """Keeps an ephemeral registration znode alive for this process.

    The node is (re)created whenever the client reports CONNECTED and
    whenever a watch reports that the node was deleted.
    """

    def __init__(self):
        """Start the client asynchronously unless ZooKeeper is disabled."""
        self.zookeeper_client = None
        if not settings.ZOOKEEPER_SETTING['enable']:
            logging.info('zookeeper disabled')
            return
        self.zoo_hosts = settings.ZOOKEEPER_SETTING['server_address']
        logging.info('start zookeeper client, zoo hosts: %s', self.zoo_hosts)
        self.base_dir = settings.ZOOKEEPER_SETTING['base_dir']
        self.zookeeper_client = KazooClient(hosts=self.zoo_hosts)
        self.zookeeper_client.add_listener(self.state_listener)
        self.zookeeper_client.start_async()

    def state_listener(self, state):
        """React to connection state changes reported by kazoo."""
        # session was lost
        if state == KazooState.LOST:
            logging.error('zookeeper lost!')
        # disconnected from Zookeeper
        elif state == KazooState.SUSPENDED:
            logging.error('zookeeper disconnected!')
        # connected/reconnected to Zookeeper
        elif state == KazooState.CONNECTED:
            self.register_node()
            logging.warning('zookeeper reconnected! try to register')
        else:
            logging.error('unexpected zookeeper state!!!')
            logging.critical('unexpected zookeeper state!!!')

    def register_node(self):
        """Create the registration node if missing and watch it.

        Only asynchronous requests are issued. Does nothing (besides
        logging) when the client is absent or not connected.
        """
        if not self.zookeeper_client or not self.zookeeper_client.connected:
            logging.error('zoo not connected, register cancel')
            return
        path = ZooHandler.get_register_path()
        try:
            # Completion callback of the exists check: create the node when
            # it does not exist yet.
            def try_to_create_node(result):
                logging.info('zoo try_to_create_noe called')
                try:
                    # A failed request also leaves ``value`` as None, so the
                    # exception has to be checked first; otherwise a failure
                    # would be mistaken for "node does not exist".
                    if result.exception:
                        logging.critical(
                            'critical error when try to check node when reconnected, %s',
                            result.exception)
                    # None means the node does not exist
                    elif result.value is None:
                        self.zookeeper_client.create_async(path,
                                                           makepath=True,
                                                           ephemeral=True)
                    else:
                        logging.warning(
                            'node already exists when reconnect and try to register'
                        )
                except BaseException as e:
                    logging.exception('critical error, %s', e)

            # Watch callback for changes of the registration node.
            def node_watcher(watch_event):
                logging.info('zoo node_watcher called')
                try:
                    if EventType.DELETED == watch_event.type:
                        logging.warning('zoo nodes deleted, try recreate')
                        self.zookeeper_client.create_async(path,
                                                           makepath=True,
                                                           ephemeral=True)
                    if EventType.CHANGED == watch_event.type:
                        logging.warning('zoo nodes changed,do nothing')
                    if EventType.CHILD == watch_event.type:
                        logging.warning('zoo nodes childed,do nothing')
                    if EventType.CREATED == watch_event.type:
                        logging.info('zoo nodes success created')
                    if EventType.NONE == watch_event.type:
                        logging.error('zoo nodes status return None')
                finally:
                    # Watches fire only once, so re-arm after every event.
                    self.zookeeper_client.exists_async(path,
                                                       watch=node_watcher)

            future = self.zookeeper_client.exists_async(path,
                                                        watch=node_watcher)
            future.rawlink(try_to_create_node)
        except ZookeeperError as e:
            logging.exception('zookeeper exception when register node: %s', e)
        except BaseException:
            logging.exception('critical error!')

    def stop(self):
        """Detach the state listener and stop the client, if one exists."""
        logging.info('stopping zookeeper client')
        if self.zookeeper_client:
            self.zookeeper_client.remove_listener(self.state_listener)
            self.zookeeper_client.stop()
            logging.info('zookeeper stopped')

    @staticmethod
    def get_register_path():
        """Return ``<base_dir>/<local_name>:<local_ip>:<http port>``.

        One trailing '/' of the configured base_dir is dropped.
        """
        base_dir = settings.ZOOKEEPER_SETTING['base_dir']
        # endswith() also copes with an empty base_dir, where indexing
        # [-1] would raise IndexError.
        if base_dir.endswith('/'):
            base_dir = base_dir[:-1]
        register_name = "%s/%s:%s:%s" % (
            base_dir, settings.ZOOKEEPER_SETTING['local_name'],
            settings.ZOOKEEPER_SETTING['local_ip'],
            settings.HTTP_SERVER_SETTING['port'])
        return register_name
Example #11
0
                 timeout=1,
                 handler=SequentialGeventHandler())
# start_async() returns an event object right away instead of blocking.
event = zk.start_async()

event.wait(timeout=1)  # wait on the event object returned by start_async()

if not zk.connected:  # the connection may never succeed, so check the state and handle the failure
    zk.stop()
    raise Exception("Unable to connect")


def my_callback(async_obj):
    """Completion callback: fetch the result and pass it to do_something().

    Exits the process with status 1 on connection loss or missing auth.
    """
    try:
        print '-------------------------'
        children = async_obj.get()
        do_something(children)
    except (ConnectionLossException, NoAuthException):
        sys.exit(1)


# Both calls below are asynchronous; ``data`` is the async result object of
# the read, not the node's content, so the print shows that object.
zk.create_async("/lws/test/1", b"test")
data = zk.get_async("/lws/test/1")
print data

# Request the children and have my_callback invoked once the result is ready.
async_obj = zk.get_children_async("/lws/test/1")
# print async_obj
async_obj.rawlink(my_callback)

# data = zk.exists_async("/lws/test/1")
# print data
class ZookeeperNonBrokerManager:
    """Service discovery for direct publisher/subscriber communication.

    Publishers register an ephemeral node holding their IP under
    ``/publishers``; subscribers watch that path and connect to or
    disconnect from publishers as nodes appear and disappear.
    """

    zk = None
    default_node_path = None
    config_file_dir = None
    zookeeper_connection_url = None
    zookeeper_connection_timeout = None
    # subscriber only
    publisher_server_default_port = None
    publisher_server_url_lst = list()
    # node name -> "tcp://ip:port" of every publisher we connected to. Needed
    # because a deleted znode can no longer be read at disconnect time.
    _publisher_addresses = None

    def __init__(self, role):
        print("[SETUP/ZK] Communicate NOT through broker")

        # Default zookeeper node path to CRUD
        self.default_node_path = "/publishers"

        # Get config file
        self._get_config_file_addr(role)

        # Get the subset of properties relevant to zookeeper
        self._init_config_props(role)

        # Init zookeeper client instance
        self.zk = KazooClient(self.zookeeper_connection_url)

    def _get_config_file_addr(self, role):
        """Store the path of ``config/<role>.config`` in ``config_file_dir``."""
        current_dir = path.dirname(path.realpath(__file__))
        parent_dir = path.dirname(path.dirname(current_dir))
        filename = role + ".config"
        self.config_file_dir = path.join(path.dirname(parent_dir), "config",
                                         filename)

    def _init_config_props(self, role):
        """Load the connection settings (and, for subscribers, the publisher
        port) from the config file. Exits the process for an unknown role."""
        props = ConfigParser()
        props.read(self.config_file_dir)
        self.zookeeper_connection_url = props["service_discovery"][
            "connection.url"]
        self.zookeeper_connection_timeout = int(
            props["service_discovery"]["connection.timeout.s"])

        if role == "publisher":
            pass
        elif role == "subscriber":
            self.publisher_server_default_port = props["publisher_server"][
                "port"]
        else:
            sys.exit("[ERR] Role not exists")

    def publisher_connect(self, role, node_id, host_ip):
        """Connect if necessary and register ``node<node_id>`` with *host_ip*.

        Exits the process when *role* is not "publisher".
        """
        # Call prevention
        if role != "publisher":
            sys.exit("[ERR] This method can only be called by publisher")

        # Connect to zookeeper server
        if not self.zk.connected:
            self.zk.start(timeout=self.zookeeper_connection_timeout)

        try:
            # Ensure a path, create if necessary
            self.zk.ensure_path(self.default_node_path)

            # Create a node with data
            node = "node" + node_id
            node_path = self.default_node_path + '/' + node
            self.zk.create_async(node_path,
                                 bytes(host_ip, "utf-8"),
                                 ephemeral=True)

        # Exit
        except KeyboardInterrupt:
            self.exit()

    def subscriber_connect(self, role, socks_connect, socks_disconnect):
        """Connect if necessary and start watching the publisher nodes.

        Exits the process when *role* is not "subscriber".
        """
        # Call prevention
        if role != "subscriber":
            sys.exit("[ERR] This method can only be called by subscriber")

        # Connect to zookeeper server
        if not self.zk.connected:
            self.zk.start(timeout=self.zookeeper_connection_timeout)

        try:
            # Ensure a path, create if necessary
            self.zk.ensure_path(self.default_node_path)

            # Watch on children
            self.watch(socks_connect, socks_disconnect)

        # Exit
        except KeyboardInterrupt:
            self.exit()

    def watch(self, socks_connect, socks_disconnect):
        """Watch the publisher nodes and keep the sockets in sync.

        *socks_connect* / *socks_disconnect* are called with a
        ``tcp://<ip>:<port>`` address for every publisher that joined / left.
        """
        # ChildrenWatch keeps calling the function on every change, so it is
        # registered exactly once; registering again from inside the callback
        # would add one more watcher per change.
        @self.zk.ChildrenWatch(self.default_node_path)
        def my_func(updated_publisher_server_url_lst):
            self._sync_publishers(updated_publisher_server_url_lst,
                                  socks_connect, socks_disconnect)

    def _sync_publishers(self, updated_nodes, socks_connect,
                         socks_disconnect):
        """Diff *updated_nodes* against the known publishers and act on it."""
        if self._publisher_addresses is None:
            self._publisher_addresses = {}

        if sorted(updated_nodes) == sorted(self.publisher_server_url_lst):
            print("[SETUP/ZK] Start watching on leader node")
            return

        # Find publishers to disconnect
        deleted_publisher_servers = [
            node for node in self.publisher_server_url_lst
            if node not in updated_nodes
        ]

        # Find newly joined publishers to connect
        new_publisher_servers = [
            node for node in updated_nodes
            if node not in self.publisher_server_url_lst
        ]

        # Update local publisher_server_url_lst
        self.publisher_server_url_lst = list(updated_nodes)

        if deleted_publisher_servers:
            print("[SETUP/ZK] Disconnected from " +
                  str(deleted_publisher_servers))
            for node in deleted_publisher_servers:
                # The znode is gone, so use the address remembered when we
                # connected instead of reading it from ZooKeeper.
                host_ip = self._publisher_addresses.pop(node, None)
                if host_ip is None:
                    print("[SETUP/ZK] No known address for " + node)
                    continue
                print("disconnect from: " + host_ip)
                socks_disconnect(host_ip)

        if new_publisher_servers:
            print("[SETUP/ZK] Connected to " + str(new_publisher_servers))
            for node in new_publisher_servers:
                node_path = self.default_node_path + '/' + node
                data, _ = self.zk.get(node_path)
                host_ip = "tcp://{0}:{1}".format(
                    data.decode("utf-8"), self.publisher_server_default_port)
                self._publisher_addresses[node] = host_ip
                print("connect to: " + host_ip)
                socks_connect(host_ip)

    def ready(self):
        """Return True when fully configured; otherwise exit the process."""
        if not self.config_file_dir:
            sys.exit("[ERR] Doesn't find the config file.")
        elif not self.zookeeper_connection_url:
            sys.exit("[ERR] Zookeeper server url is EMPTY.")
        elif not self.zk:
            sys.exit("[ERR] Zookeeper instance instantiation FAILED.")

        return True

    def exit(self):
        """Stop and close the client, then raise KeyboardInterrupt."""
        print("[EXIT] Disconnect from zookeeper server.")
        # Always stop the zk instance and disconnect
        self.zk.stop()
        self.zk.close()
        raise KeyboardInterrupt
Example #13
0
class Coordinator(object):
    """Coordinates HQ servers and crawl jobs through ZooKeeper (kazoo client).

    Znode layout used by this class, relative to ``root``:

    * ``<root>/servers/<nodename>``                  this server
    * ``<root>/servers/<nodename>/<alivenode>``      ephemeral liveness marker
    * ``<root>/servers/<nodename>/jobs/<jobname>``   jobs this server works on
    * ``<root>/jobs/<jobname>``                      global job registry
    * ``<root>/jobs/<jobname>/<nodename>``           ephemeral "server is on job"
    * ``<root>/jobs/<jobname>/servers/<svid>``       servers configured for a job

    Interested parties can subscribe to the ``"serverschanged"`` and
    ``"jobschanged"`` events with :meth:`add_listener`.
    """

    def __init__(self, zkhosts, root=NODE_HQ_ROOT, alivenode="alive", readonly=False, role=None):
        """zkhosts: a string or a list. list will be ','.join-ed into a string.
        root: root node path (any parents must exist, if any)
        alivenode: name of the ephemeral child created under this server's
          node by publish_alive().
        readonly: if true, _initialize() only installs the children watches
          and creates no znodes for this server.
        role: accepted for interface compatibility; not used by this class.
        """
        self.LOGGER = logging.getLogger("hq.zkcoord")

        # NOTE: basestring exists on Python 2 only, which this class targets.
        if not isinstance(zkhosts, basestring):
            zkhosts = ",".join(zkhosts)
        self.zkhosts = zkhosts
        self.ROOT = root
        self.alivenode = alivenode
        self.readonly = readonly
        self.nodename = os.uname()[1]

        self.NODE_SERVERS = self.ROOT + "/servers"
        self.NODE_ME = self.NODE_SERVERS + "/" + self.nodename
        self.NODE_MYJOBS = self.NODE_ME + "/jobs"
        self.NODE_GJOBS = self.ROOT + "/jobs"

        # event name -> set of listener callables
        self.__listeners = {}

        # job -> time.time() of the last publish_job() for it
        self.jobs = {}

        self.zh = None
        self.zstate = None
        # Must exist before _connect(): get_status_text() reads it even when
        # no connection error has ever been recorded.
        self.zkerror = None

        self._connect()

    def _connect(self):
        """(Re)create the kazoo client and start it.

        An existing client is stopped first. A ZookeeperError raised along
        the way is stored in ``self.zkerror`` and leaves ``self.zh`` as None.
        """
        try:
            if self.zh:
                self.zh.stop()
            self.LOGGER.debug("connecting to %s", self.zkhosts)
            self.zh = KazooClient(hosts=self.zkhosts)
            self.zh.add_listener(self.__watcher)
            # this will wait until connection is established.
            self.zh.start()
        except ZookeeperError as ex:
            self.zh = None
            self.zkerror = ex

    def _initialize(self, first_connect=None):
        """Create this server's znodes (unless readonly) and install watches.

        first_connect: whether this is the very first connection, in which
          case the shared parent nodes are ensured. When None it is derived
          from ``self.zstate``; __watcher passes it explicitly because it
          updates ``self.zstate`` right after spawning this method.
        """
        if first_connect is None:
            first_connect = self.zstate is None
        if not self.readonly:
            if first_connect:
                self.zh.ensure_path(self.NODE_SERVERS)
                self.zh.ensure_path(self.NODE_GJOBS)
            if not self.zh.exists(self.NODE_ME):
                self.zh.create(self.NODE_ME, "")
            if not self.zh.exists(self.NODE_MYJOBS):
                self.zh.create_async(self.NODE_MYJOBS)
            # The alive node is a child of NODE_ME, so it can only be created
            # once NODE_ME is known to exist.
            self.publish_alive()
        # setup notifications
        self.zh.get_children(self.NODE_SERVERS, self.__servers_watcher)
        self.zh.get_children(self.NODE_GJOBS, self.__jobs_watcher)

    def __watcher(self, state):
        """Connection state listener registered with the kazoo client."""
        # client level callback method. this method should not block.
        if state == KazooState.LOST:
            # session expiration
            self.zstate = state
        elif state == KazooState.SUSPENDED:
            # disconnected, session is still alive
            self.zstate = state
        else:
            # (re)connected
            self.LOGGER.debug("connected")
            # Decide "first connection" before zstate is overwritten below;
            # the spawned _initialize may run only after that assignment.
            first_connect = self.zstate is None
            self.zh.handler.spawn(self._initialize, first_connect)
            self.zstate = state

    def get_status_text(self):
        """Return the last recorded connection error, or None if there is none."""
        return self.zkerror

    def create(self, path, data=""):  # , perm=PERM_WORLD, flags=''):
        """Synchronously create znode ``path`` holding ``data``."""
        return self.zh.create(path, data)

    def acreate(self, path, data=""):  # , perm=PERM_WORLD, flags=''):
        """Asynchronously create znode ``path``; returns the async result."""
        return self.zh.create_async(path, data)

    def exists(self, path):
        """Return the client's ``exists()`` result for ``path``."""
        return self.zh.exists(path)

    def delete(self, path):
        """Delete ``path``; a missing node is ignored and yields None."""
        try:
            return self.zh.delete(path)
        except NoNodeError:
            return None

    def get_children(self, path, watch=None):
        """Return the children of ``path``, optionally setting ``watch``."""
        return self.zh.get_children(path, watch=watch)

    def __servers_watcher(self, *event_args):
        """called when HQ servers are added / dropped.

        kazoo invokes watch callbacks with a single event argument; any
        arguments are accepted and ignored so older call forms keep working.
        """
        try:
            # watches are one-shot: re-register while reading the children
            ch = self.get_children(self.NODE_SERVERS, watch=self.__servers_watcher)
            self.LOGGER.info("servers added/removed:%s", str(ch))
            self.fire_event("serverschanged")
        except ZookeeperError:
            self.LOGGER.warning("zk.get_children(%r) failed", self.NODE_SERVERS, exc_info=1)

    def __jobs_watcher(self, *event_args):
        """called when jobs are added / dropped.

        Accepts and ignores whatever arguments the watch is invoked with.
        """
        try:
            self.LOGGER.info("%s children changed", self.NODE_GJOBS)
            # called only to re-register the one-shot watch
            self.get_children(self.NODE_GJOBS, watch=self.__jobs_watcher)
            self.fire_event("jobschanged")
        except ZookeeperError:
            self.LOGGER.warning("get_children(%r) failed", self.NODE_GJOBS, exc_info=1)

    def publish_alive(self):
        """Asynchronously create the ephemeral alive node under this server."""
        node_alive = self.NODE_ME + "/" + self.alivenode
        self.zh.create_async(node_alive, ephemeral=True)

    def publish_job(self, job):
        """job: hq.CrawlJob

        Touches this server's node for the job and registers the job in the
        global job list. Repeated calls for the same job are throttled to one
        publication per 10 minutes.
        """
        ju = self.jobs.get(job)
        # update 10 minutes interval
        if ju is None or ju < time.time() - 10 * 60:
            NODE_MYJOB = self.NODE_MYJOBS + "/" + job.jobname

            def set_complete(a):
                # a.exception holds an exception *instance*, so it has to be
                # tested with isinstance rather than compared to the class.
                if isinstance(a.exception, NoNodeError):
                    # node does not exist yet - create anew
                    self.zh.create_async(NODE_MYJOB, "")

            try:
                a = self.zh.set_async(NODE_MYJOB, "")
                a.rawlink(set_complete)
            except Exception:
                # best effort: a failed touch must not prevent registration
                self.LOGGER.warning("aset failed", exc_info=1)

            node2 = self.NODE_GJOBS + "/" + job.jobname
            self.zh.create_async(node2)
            self.zh.create_async("{}/{}/{}".format(self.NODE_GJOBS, job.jobname, self.nodename), ephemeral=True)
            self.jobs[job] = time.time()

    def publish_client(self, job, client):
        """Placeholder; does nothing."""
        pass

    def get_servers(self):
        """Return the names of the children of the servers node."""
        return self.zh.get_children(self.NODE_SERVERS)

    def get_server_job(self, server, job):
        """Return ``{"ts": seconds}`` for ``job`` on ``server``.

        ``ts`` is the job node's mtime divided by 1000.0, or 0 when the node
        does not exist.
        """
        p = self.NODE_SERVERS + "/" + server + "/jobs/" + job
        j = dict()
        try:
            nodeval = self.zh.get(p)
            attr = nodeval[1]
            j["ts"] = attr.mtime / 1000.0
        except NoNodeError:
            j["ts"] = 0
        return j

    def get_status_of(self, server=None, jobs=None):
        """Return a status dict for ``server`` (default: this node).

        server: server name; defaults to this process's node name.
        jobs: None (all jobs found under the server), a single job name, or
          an iterable of job names.

        Returns None when there is no client. Otherwise a dict with ``name``,
        ``alive`` (the stat of the server's "alive" node, or False when that
        node is missing) and ``jobs`` (list of get_server_job() dicts, each
        extended with ``name``).
        """
        if self.zh is None:
            return None
        server = server or self.nodename
        status = dict(name=server)
        try:
            node = self.zh.get(self.NODE_SERVERS + "/" + server + "/alive")
            status["alive"] = node[1]
        except NoNodeError:
            status["alive"] = False

        jobspath = self.NODE_SERVERS + "/" + server + "/jobs"
        if jobs is None:
            try:
                jobs = self.get_children(jobspath)
            except NoNodeError:
                jobs = []
        elif isinstance(jobs, basestring):
            jobs = [jobs]
        status["jobs"] = []
        for j in jobs:
            jobj = self.get_server_job(server, j)
            jobj["name"] = j
            status["jobs"].append(jobj)
        return status

    def get_servers_status(self):
        """Return get_status_of() for every server listed by get_servers()."""
        return [self.get_status_of(server) for server in self.get_servers()]

    def get_job_servers(self, jobname):
        """return a map of integer identifier to server name, which
        is configured for the job jobname.

        Children whose name is not an integer, and children with empty data,
        are skipped. An absent servers node yields an empty dict.
        """
        p = self.NODE_GJOBS + "/" + jobname + "/servers"
        try:
            svids = self.get_children(p)
        except NoNodeError:
            return {}

        servers = {}
        for name in svids:
            try:
                svid = int(name)
            except ValueError:
                continue
            # read through the real child name: str(svid) differs from it
            # for names such as "01" and would address a non-existent node.
            nodevals = self.zh.get(p + "/" + name)
            if nodevals[0]:
                servers[svid] = nodevals[0]
        return servers

    def get_job_servers2(self, jobname):
        """returns servers for job "jobname", including those active
        but not registered at jobs/JOBNAME/servers. elements of returned
        list are dict with svid and name keys. svid key only exists for
        those registered for the "jobname".
        """
        servers = [dict(svid=svid, name=name) for svid, name in self.get_job_servers(jobname).items()]
        regservers = set(s["name"] for s in servers)

        try:
            ss = self.get_children(self.NODE_SERVERS)
            for s in ss:
                if s in regservers:
                    continue
                if not self.is_server_alive(s):
                    continue
                p = self.NODE_SERVERS + "/" + s + "/jobs/" + jobname
                if self.exists(p):
                    servers.append(dict(name=s))
        except ZookeeperError:
            self.LOGGER.debug("zookeeper access failed", exc_info=1)

        return servers

    def add_job_server(self, job, server):
        """Placeholder; does nothing."""
        pass

    def delete_job_server(self, job, server):
        """Remove ``server`` from ``job``.

        Deletes the server's entry under the job's ``servers`` node (when it
        is registered there) and the job node under the server. Missing or
        non-empty nodes are ignored.
        """
        jobservers = dict((v, k) for k, v in self.get_job_servers(job).items())
        if server in jobservers:
            # svid is an int (see get_job_servers); it has to be converted
            # before it can be appended to the path string.
            p = self.NODE_GJOBS + "/" + job + "/servers/" + str(jobservers[server])
            self.delete(p)

        p = self.NODE_SERVERS + "/" + server + "/jobs/" + job
        # assumption: there's no child under the server/job node.
        try:
            self.delete(p)
        except NoNodeError:
            pass
        except NotEmptyException:
            # XXX
            pass

    def is_server_alive(self, server):
        """Return the ``exists()`` result for the server's "alive" node."""
        p = self.NODE_SERVERS + "/" + server + "/alive"
        return self.exists(p)

    def add_listener(self, ev, listener):
        """Register callable ``listener`` for event name ``ev``.

        Raises ValueError when ``ev`` is not a string.
        """
        if not isinstance(ev, basestring):
            raise ValueError("ev must be a string")
        ll = self.__listeners.get(ev)
        if not ll:
            self.__listeners[ev] = set((listener,))
        else:
            ll.add(listener)

    def remove_listener(self, ev, listener):
        """Unregister ``listener`` from ``ev``; unknown listeners are ignored.

        Raises ValueError when ``ev`` is not a string.
        """
        if not isinstance(ev, basestring):
            raise ValueError("ev must be a string")
        ll = self.__listeners.get(ev)
        if ll:
            ll.discard(listener)

    def fire_event(self, ev, *args):
        """Call every listener registered for ``ev`` with ``args``.

        A listener that raises is logged and does not stop the others.
        """
        ll = self.__listeners.get(ev)
        if not ll:
            return
        # Iterate over a snapshot so that a listener may register or
        # unregister listeners while the event is being delivered.
        for listener in list(ll):
            try:
                listener(*args)
            except Exception:
                self.LOGGER.warning("error running listener %r with ev=%r, args=%r", listener, ev, args, exc_info=1)

    def shutdown(self):
        """Stop the kazoo client, if any, and forget it."""
        if self.zh:
            self.zh.stop()
            self.zh = None
Example #14
0
from kazoo.handlers.gevent import SequentialGeventHandler

# Set up the default logging configuration before the client starts.
logging.basicConfig()

# Client for the server at zoo1:2181, driven by the gevent handler.
zk = KazooClient(hosts='zoo1:2181', handler=SequentialGeventHandler())
# start_async() hands back an event object to wait on for the connection.
event = zk.start_async()
# Wait at most 30 seconds for the connection attempt.
event.wait(timeout=30)
if not zk.connected:
    # Not connected, stop trying to connect
    print("ummm")
    zk.stop()
    raise Exception("Unable to connect.")


def my_callback(async_obj):
    """Completion callback for an asynchronous ZooKeeper request.

    Fetches the outcome of ``async_obj`` and prints ``hi`` when that works.
    If fetching raises ConnectionLossException or NoAuthException, the
    process is asked to exit with status 1.
    """
    try:
        # get() re-raises the error of a failed request; the value itself
        # is not needed here.
        async_obj.get()
    except (ConnectionLossException, NoAuthException):
        sys.exit(1)
    else:
        print("hi")


# Queue creation of /master and its first child. The results of these
# *_async calls are not inspected here.
zk.create_async("/master", b'mas')
zk.create_async("/master/node_1", b'c1')
print("gdgd")
# Both of the next two statements return immediately; the second sets a
# callback that will be run when get_children_async has its return value.
async_obj = zk.get_children_async("/master")
async_obj.rawlink(my_callback)
# Queue creation of a second child after the children request was issued.
zk.create_async("/master/node_2", b'c2')