Example no. 1
class HHFrontierWorker(FrontierWorker):
    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        super(HHFrontierWorker, self).__init__(settings, no_batches, no_scoring, no_incoming)
        self.init_zookeeper(settings)

    def init_zookeeper(self, settings):
        self._zk = KazooClient(hosts=settings.get('ZOOKEEPER_LOCATION'))
        self._zk.add_listener(self.zookeeper_listener)
        self._zk.start()
        self.znode_path = self._zk.create("/frontera/hh-f-worker", ephemeral=True, sequence=True, makepath=True)

    def zookeeper_listener(self, state):
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            pass
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            pass
        else:
            # Handle being connected/reconnected to Zookeeper
            pass

    def set_process_info(self, process_info):
        self.process_info = process_info
        self._zk.set(self.znode_path, self.process_info)

    def set_job_id(self, job_id):
        self._backend.set_job_id(job_id)
        self.job_id = job_id
Example no. 2
def start():
  global zk

  zk = KazooClient()

  if shell.config['barrier'] is True:
    path_barrier = '/'+shell.config['identity']+'/barrier'
    value_barrier = json.dumps({'NodeId':shell.config['nodeid']}, encoding='utf-8')

    @zk.DataWatch(path_barrier)
    def watch_node(data, stat, event):
      global flag
      if event:
        logging.info("Node Event %s %s, data %s" %(event.path, event.type, data))
        if event.type == EventType.DELETED:
          flag[0] = True
          if flag[1]:
            zk.handler.spawn(create_ephemeral)
          else:
            flag[1] = True


  zk.add_listener(my_listener)

  try:
    zk.start()
  except Exception as e:
    logging.error(e)
    sys.exit(1)
Example no. 3
def connect_to_zk():
    zookeeper_connect_string = os.getenv('ZOOKEEPER_CONN_STRING')
    zk = KazooClient(hosts=zookeeper_connect_string)
    zk.start()
    zk.add_listener(state_listener)
    logging.info("connected to Zookeeper")
    return zk
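
Several of the examples above and below register a listener (my_listener, state_listener) whose definition is not shown. A minimal sketch of such a callback, assuming only the standard KazooState values, might look like this; it is illustrative, not code from the original projects:

import logging

from kazoo.protocol.states import KazooState

def state_listener(state):
    # Invoked from Kazoo's connection-management thread; keep it fast
    # and non-blocking.
    if state == KazooState.LOST:
        logging.warning("Zookeeper session lost")
    elif state == KazooState.SUSPENDED:
        logging.warning("Zookeeper connection suspended")
    else:
        logging.info("Zookeeper connected")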
Example no. 4
class Exhibitor:

    def __init__(self, exhibitor, chroot):
        self.chroot = chroot
        self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=30)
        self.client = KazooClient(hosts=self.exhibitor.zookeeper_hosts + self.chroot,
                                  command_retry={
                                      'deadline': 10,
                                      'max_delay': 1,
                                      'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.client.start()

    def session_listener(self, state):
        pass

    def _poll_exhibitor(self):
        if self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts + self.chroot)

    def get(self, *params):
        self._poll_exhibitor()
        return self.client.retry(self.client.get, *params)

    def get_children(self, *params):
        self._poll_exhibitor()
        try:
            return self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []
Example no. 5
class ZookeeperSession(object):

    def __init__(self, locations, name_prefix, root_prefix='/frontera'):
        self._zk = KazooClient(hosts=locations)
        self._zk.add_listener(self.zookeeper_listener)
        self._zk.start()
        self.root_prefix = root_prefix
        self.znode_path = self._zk.create("%s/%s" % (self.root_prefix, name_prefix),
                                          ephemeral=True,
                                          sequence=True,
                                          makepath=True)

    def zookeeper_listener(self, state):
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            pass
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            pass
        else:
            # Handle being connected/reconnected to Zookeeper
            pass

    def set(self, value):
        self._zk.set(self.znode_path, value)

    def get_workers(self, prefix='', exclude_prefix=''):
        for znode_name in self._zk.get_children(self.root_prefix):
            if prefix and not znode_name.startswith(prefix):
                continue
            if exclude_prefix and znode_name.startswith(exclude_prefix):
                continue
            location, _ = self._zk.get(self.root_prefix+"/"+znode_name)
            yield location
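
A hypothetical usage of ZookeeperSession, assuming a reachable ensemble on localhost:2181 and permission to create the /frontera root; the host string, prefix, and payload are illustrative only:

session = ZookeeperSession('localhost:2181', name_prefix='worker-')
session.set(b'host-1:9000')  # publish this worker's location to its ephemeral znode
for location in session.get_workers():
    print(location)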
Example no. 6
class ZKStore:
    def __init__(self, hosts):
        self.zk = KazooClient(hosts=hosts)
        self.zk.add_listener(listener)
        self.zk.start()


    def isConnected(self):
        return __state__ == 1


    def write(self, path, node, value):
        self.zk.ensure_path(path)
        if self.zk.exists(path+"/"+node):
           self.zk.set(path+"/"+node, value)
        else:
           self.zk.create(path + "/" + node, value)


    def read(self, path):
        if self.zk.exists(path):
            data, stat = self.zk.get(path)
            return data
        return None
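
ZKStore refers to a module-level listener and a __state__ flag that are not shown. A minimal sketch of what they might look like (the names mirror the references in the class; the implementation is an assumption):

from kazoo.protocol.states import KazooState

__state__ = 0

def listener(state):
    # Record the connection state in a module-level flag so that
    # ZKStore.isConnected() can poll it.
    global __state__
    __state__ = 1 if state == KazooState.CONNECTED else 0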
Example no. 7
def start(servers=None, force_reconnect=False):
    global zk
    log.warn("start zookeeper current:%s" % zk)
    if zk and not force_reconnect:
        return
    server_list = []
    if not servers:
        try:
            with open("/opt/cluster/zookeeper_addresses.json") as f:
                info = json.loads(f.read())
                for s in info.get("cluster"):
                    server_list.append("%s:2181" % s.split(":")[0])
        except:
            pass
        if len(server_list) == 0:
            server_list.append("127.0.0.1:2181")

        servers = str(",".join(server_list))
    else:
        server_list = servers.split(",")

    def pick_server():
        for t in xrange(len(server_list)):
            server = server_list[random.randint(0, len(server_list) - 1)]
            try:
                host, port = server.split(":")
            except:
                host, port = server, 2181
            try:
                log.warn("checking service %s:%s" % (host, port))
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.settimeout(3)
                s.connect((str(host), int(port)))
                s.close()
                log.warn("checking service %s:%s success" % (host, port))
                return server
            except:
                log.warn("checking service %s:%s failed" % (host, port))
                continue

    log.warn("Zookeeper connect to: %s connect" % (servers))
    logger = logging.getLogger("kazoo")
    logger.setLevel(logging.WARNING)
    zk = KazooClient(hosts=servers, timeout=ztimeout, logger=logger)
    zk.add_listener(connection_state)
    zk.start(timeout=ztimeout)  # kazoo multiplies this by 1000; it should not.
    log.warn("connected")

    wait_time = 0
    while True:
        log.warn("wait %s" % current_state)
        if current_state == "CONNECTED":
            log.warn("return")
            break
        if wait_time > 10:
            log.warn("wait too much killing myself")
            os.kill(os.getpid(), signal.SIGKILL)
            os._exit(1)
        time.sleep(1)
        wait_time += 1
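
start() polls a module-level current_state that is updated by the connection_state listener; neither is shown above. A plausible sketch, relying on the fact that KazooState values are plain strings such as "CONNECTED":

current_state = ""

def connection_state(state):
    # Listener registered in start(); records the last seen state so the
    # wait loop can poll for "CONNECTED".
    global current_state
    current_state = str(state)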
Example no. 8
    def achieve_consensus(self):
        """Trigger consensus logic and handle errors."""

        log.info('Set up ZK client using host(s): %s', self._hosts)
        zk = KazooClient(hosts=self._hosts)

        # Initialize ZK connection state variable, which is shared across
        # threads. It is updated from a change listener function which is
        # invoked from within a Kazoo connection management thread, see
        # http://kazoo.readthedocs.org/en/latest/api/handlers/threading.html.
        self._connected = False
        zk.add_listener(self._zk_state_change_listener)
        zk.start()

        # Wait for the handling thread to update the connection status.
        # (Because of non-determinism around GIL context switches there is
        # otherwise no guarantee that the status is updated within
        # `_run_consensus_procedure`).
        while not self._connected:
            time.sleep(0.01)

        self._zk = zk
        try:
            # This may raise ConnectionLost or various
            # kazoo.exceptions.* types.
            return self._run_consensus_procedure()
        finally:
            log.info('Shut down ZK client.')
            try:
                zk.stop()
            finally:
                zk.close()
Example no. 9
def consumer_group(request, cluster_id, group_id):
	cluster = get_cluster_or_404(id=cluster_id)
	zk = KazooClient(hosts=cluster['zk_host_ports'])
	zk.add_listener(my_listener)
	zk.start()
	consumer_group = _get_consumer_group(zk=zk,cluster=cluster,group_id=group_id)
	zk.stop()
	return render('consumer_group.mako', request, {'cluster': cluster, 'consumer_group':consumer_group})
Example no. 10
    def connect_to_zookeeper(self):
        """
        Connect to zookeeper
        """
        zk = KazooClient(hosts=self.zookeeper_urls, read_only=True)
        zk.start()
        zk.add_listener(self.my_listener)
        return zk
Example no. 11
class SentinelDaemon(object):
    def __init__(self, port):
        """
        Read config and spawn child processes.
        :type port: int
        """
        self._log = logging.getLogger('sent.daemon')
        self._log.info('Creating Sentinel')

        self._port = port
        self.children = dict()
        self._settings = None
        self._system = get_system()
        self._hostname = socket.getfqdn()
        self._prev_state = None
        self.listener_lock = Lock()
        self.version = get_version()
        self.task_client = None

        self.zkclient = KazooClient(hosts=get_zk_conn_string(),
                                    timeout=60.0,
                                    handler=SequentialThreadingHandler(),
                                    logger=logging.getLogger('kazoo.daemon'))

        self.zkclient.add_listener(self._zk_listener)
        # this will run self._reset_after_connection_loss
        self.zkclient.start()
        while not self._settings:
            self._log.info('Waiting for settings.')
            time.sleep(1)

        self._tmp_dir = os.path.join(self._settings.get('zookeeper').get('temp_directory', '/'), 'ruok')

        self.task_client = ZKTaskClient(self.children,
                                        self.zkclient,
                                        self._settings.get('zookeeper', {}).get('task'))

        self._rest_server = tornado.httpserver.HTTPServer(RestServer(self.children,
                                                                     self.version,
                                                                     self._tmp_dir,
                                                                     self._hostname,
                                                                     self.zkclient))

        signal.signal(signal.SIGINT, self._handle_sigint)
        signal.signal(signal.SIGTERM, self._handle_sigint)
        self._log.info('Created Sentinel')

    def __enter__(self):
        logging.info('Starting Sentinel, listening on port {}'.format(self._port))
        if platform.system() == 'Linux':
            import resource
            try:
                resource.setrlimit(resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
                logging.info('Set RLIMIT_CORE to unlimited. Core files can be generated by sentinel on this system.')
            except ValueError, ve:
                logging.info('Invalid resource limit specified. Core files will not be generated for apps: {0}'.format(ve))
            except AttributeError, ae:
                logging.info('AttributeError experienced: {0}'.format(ae))
Example no. 12
class dzk:
    def __init__(self,hosts,secs):
        self.hosts = hosts
        #self.zk = KazooClient(hosts='1.1.1.3:2181,1.1.1.2:2181,1.1.1.1:2181',retry_max_delay=2000)
        self.zk = KazooClient(hosts=self.hosts)
        try:
            self.zk.start()
            self.zk.add_listener(self.listener)
        except Exception, e:
            print "ERROR: zookeeper connect failed:", e
Example no. 13
def _get_consumers(cluster):
	zk = KazooClient(hosts=cluster['zk_host_ports'])
	zk.add_listener(my_listener)
	zk.start()
	groups = _get_consumer_groups(zk,cluster['id'])
	consumer_groups = []
	for group in groups:
		consumer_groups.append(_get_consumer_group(zk=zk,cluster=cluster,group_id=group))
	zk.stop()
	return consumer_groups
Example no. 14
class dzk:
    def __init__(self):
        self.BasePath = "/my/"
        self.zk = KazooClient(hosts='x.24.79.51:2181,x.24.79.53:2181',retry_max_delay=2000)
        self.zk.start()
        self.zk.add_listener(self.listener)

    def listener(self, state):
        if state == KazooState.LOST:
            self.zk.start()
        elif state == KazooState.SUSPENDED:
            print "*******listener saw KazooState.SUSPENDED"
        else:
            print "*******listener saw KazooState.CONNECTED"

    def getIpHost(self):
        self.myname  = socket.getfqdn(socket.gethostname())
        myip = socket.gethostbyname(self.myname)
        return  myip

    def register(self):
        ip = self.getIpHost()
        if ip:
            NODE = self.BasePath + ip
            print "register:", NODE
        else:
            print "[ERROR:] could not determine local IP, not registering"
            sys.exit(2)

        if not self.zk.exists(NODE):
            self.zk.ensure_path(NODE)

    def getData(self):
        ip = self.getIpHost()
        if ip:
            NODE = self.BasePath + ip
        else:
            print "[ERROR:] could not determine local IP"
            return

        if self.zk.exists(NODE):
            data, stat = self.zk.get(NODE)
            print("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))

    def monitor(self):
        pass
    
    def heartbeat(self):
        pass

    def role(self):
        pass
    
    def command(self):
        pass
Example no. 15
def get_zk(zkhosts, timeout, command_retry=None, connection_retry=None):
    '''
    Initiate a zookeeper connection and add a listener
    '''
    conn = KazooClient(hosts=zkhosts, timeout=timeout, command_retry=command_retry, connection_retry=connection_retry)
    conn.add_listener(listener)
    try:
        conn.start()
    except KazooTimeoutError as exc:
        log.error(exc)
        sys.exit(1)
    return conn
Example no. 16
def main():
    zk = KazooClient(hosts="127.0.0.1:2181", timeout=2.0)
    zk.add_listener(my_listener)
    zk.start()

    if zk.exists("/ELECTION") == None:
        zk.ensure_path("/ELECTION")

    c = 1
    node_paths = []
    while c < 10:
        c += 1
        node_path = zk.create("/ELECTION/guid-n_", b"a value", ephemeral=True, sequence=True)
        node_paths.append(node_path)

    my_path = random.choice(node_paths)
    my_path = my_path.replace("/ELECTION/", "")
    # print "print my_path", my_path

    children = zk.get_children("/ELECTION", watch=election_child_watcher)
    get_next_path = False

    prev_path = None
    for child_path in sorted(children):
        if child_path == my_path:
            break
        prev_path = child_path

    # I'm the leader
    if prev_path is None:
        print "OK I'm leader, don't have to watch"
        return

    # fires twice, once on creation; ignore the first

    @zk.DataWatch("/ELECTION/" + prev_path)
    def watch_node(data, stat):
        # only watch for first change
        if stat.version == 1:
            print ("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))
            print "setting watch on " + prev_path
            print "my", my_path

    zk.set("/ELECTION/" + prev_path, b"some data")

    print "boom. watch triggered?"
    # time.sleep(10)
    print "bye"

    zk.stop()
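
The election example registers an election_child_watcher on the children of /ELECTION without defining it. A minimal sketch (a one-shot watch function that receives a single WatchedEvent; the body is illustrative):

def election_child_watcher(event):
    # Fires once when the children of /ELECTION change.
    print "election children changed:", event.path, event.type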
Example no. 17
    def connect(self):
        """Initialize a connection to the Zookeeper quorum.

        :return: Kazoo client object as connection.
        """
        client_kwargs = dict(
            hosts=self.app.config['KAZOO_HOSTS'],
            timeout=self.app.config['KAZOO_SESSION_TIMEOUT'],
            connection_retry=self.app.config['KAZOO_RETRY'],
            command_retry=self.app.config['KAZOO_RETRY']
        )
        # Are ACL credentials configured?
        username = self.app.config.get('KAZOO_ACL_USERNAME', None)
        password = self.app.config.get('KAZOO_ACL_PASSWORD', None)

        if username and password:
            client_kwargs.update(dict(
                default_acl=[
                    make_digest_acl(
                        username=username,
                        password=password,
                        read=self.app.config.get(
                            'KAZOO_ACL_READ', False
                        ),
                        write=self.app.config.get(
                            'KAZOO_ACL_WRITE', False
                        ),
                        create=self.app.config.get(
                            'KAZOO_ACL_CREATE', False
                        ),
                        delete=self.app.config.get(
                            'KAZOO_ACL_DELETE', False
                        ),
                        admin=self.app.config.get(
                            'KAZOO_ACL_ADMIN', False
                        ),
                        all=self.app.config.get(
                            'KAZOO_ACL_ALL', False
                        )
                    )
                ],
                auth_data=[("digest", ":".join((username, password)))],
            ))

        client = KazooClient(**client_kwargs)
        client.start(timeout=self.app.config['KAZOO_START_TIMEOUT'])
        client.add_listener(self.connection_state_listener)
        return client
Example no. 18
class ZookeeperSession(BaseClient):
  context_manager = ZookeeperResponseContextManager
  loose_policy = {}
  strict_policy = {}

  def __init__(self,server_list='127.0.0.1:2181',*args,**kwargs):
    super(ZookeeperSession,self).__init__(*args,**kwargs)
    self.session_policy = "loose_policy"
    self._zookeeper_client = None 
    self.server_list = server_list

  def set_session_policy(self,session_policy="loose"):
    '''prototype not currently used.
    '''
    self.session_policy = session_policy+"_policy"

  def connect(self,*args,**kwargs):
    '''See http://kazoo.readthedocs.org/en/latest/api/client.html
     for details regarding available options. Any provided client
     start() parameters provided will override defaults.
    '''
    defaults = {
      "hosts" : self.server_list,
      "handler" : SequentialGeventHandler()
    }
    defaults.update(getattr(self,self.session_policy))
    defaults.update(kwargs)
    self._state = KazooState.LOST
    self._zookeeper_client = KazooClient(**defaults)
    self._zookeeper_client.add_listener(self._state_tracker)
    watchable = self._zookeeper_client.start_async()
    watchable.wait(30)
    if not self._zookeeper_client.connected:
      err = "Could not connect to Zookeeper server(s) %(server_list)s" % defaults
      raise ResponseError(err)

  @require_state(KazooState.CONNECTED)
  @record_stats
  def ensure_path(self,path,watcher=None):
    self._zookeeper_client.ensure_path(path,watcher)
    
  def _state_tracker(self,state):
    self._state = state

  def __del__(self):
    if isinstance(self._zookeeper_client, KazooClient):
      self._zookeeper_client.stop() 
Example no. 19
class NodeMonitor:
    STATIC_NODE_ID = 0
    def __init__(self):
        self.zk = None
        self.SERVER_IP_AND_PORT = "localhost:2181"
        self.NODE_ID = str(NodeMonitor.STATIC_NODE_ID)
        NodeMonitor.STATIC_NODE_ID += 1 
    
    def start_zk(self):
        self.zk = KazooClient(hosts=self.SERVER_IP_AND_PORT)
        self.zk.add_listener(self._connection_listener)
        self.zk.start()
        
        self.zk.ensure_path("/monitorData/"+ self.NODE_ID)
    
    def start_update_info(self):
        t = threading.Timer(0.0, self._update_info)
        t.start()
        
    
    def _update_info_once(self):
        cmi = CollectMachineInfo()
        async_obj = self.zk.set_async("/monitorData/"+ self.NODE_ID, (cmi.collectInfo()).encode(encoding="utf-8"))
        async_obj.rawlink(self._update_info_callback)
    
        
    def _connection_listener(self, state):
        if state == KazooState.LOST:
            print "connection lost, going to connect again"
            self.start_zk()
        elif state == KazooState.SUSPENDED:
            print "suspended"
        else:
            print "connected ok"
    
    def _update_info_callback(self, async_obj):
        try:
            async_obj.get()
            print "update success"
        except (ConnectionLossException, NoAuthException):
            print "exception!"
    
    def _update_info(self):
        print "begin to update"
        self._update_info_once()
        t = threading.Timer(5.0, self._update_info)
        t.start()
Example no. 20
    def __setupMetaServerConnection(self):
        keeperHosts = config.get('server', 'meta')
        parsed = urlparse(keeperHosts)

        if parsed.scheme != 'zk':
            raise ValueError("Meta URL must start with zk://")

        if parsed.path in ('/', ''):
            raise ValueError("Service root path not found.")

        self.rootpath = parsed.path.rstrip('/')

        # NOTE: currently, auth_data is not supported
        servers = parsed.netloc.split('@')[-1]
        metaClient = KazooClient(hosts=servers, handler=SequentialGeventHandler())
        metaClient.add_listener(self.__connection)

        return metaClient
Example no. 21
def run():
    replication_factor = 3
    zookeeper_connect_string = os.getenv('ZOOKEEPER_CONN_STRING')
    logging.info("waiting for kafka to start up")
    if os.getenv('WAIT_FOR_KAFKA') != 'no':
        wait_for_kafka_startup.run(get_own_ip())
    else:
        sleep(10)

    logging.info("kafka port is open, continuing")

    zk = KazooClient(hosts=zookeeper_connect_string)
    zk.start()
    zk.add_listener(state_listener)

    logging.info("connected to Zookeeper")

    zk_dict = get_zk_dict(zk)
    result = generate_json(zk_dict, replication_factor, broken_topics=True)
    if result != {}:
        logging.info("JSON generated")
        logging.info("there are " + str(len(result['partitions'])) + " partitions to repair")
        logging.debug(result)
        if os.getenv('WRITE_TO_JSON') != 'no':
            write_json_to_zk(zk, result)
    else:
        logging.info("no JSON generated")
        needed = False
        for broker in zk_dict['broker']:
            if int(get_broker_weight(zk_dict, {'partitions': []}, broker)) == 0:
                needed = True
        if needed is True:
            result = generate_json(zk_dict, replication_factor, broken_topics=False)
            if result != {}:

                logging.info("JSON generated")
                if os.getenv('WRITE_TO_JSON') != 'no':
                    write_json_to_zk(zk, result)
        else:
            logging.info("no unused Broker found")

    zk.stop()
    logging.info("exiting")
Example no. 22
def _get_cluster_topology(cluster):
	zk = KazooClient(hosts=cluster['zk_host_ports'])
	zk.add_listener(my_listener)
	zk.start()
	brokers = _get_brokers(zk,cluster['id'])
	consumer_groups = _get_consumer_groups(zk,cluster['id'])
	consumer_groups_status = {} # 0 = offline, (not 0) =  online
	for consumer_group in consumer_groups:
		consumers_path = cluster['consumers_path'] + "/" + consumer_group + "/ids"
		try:
			consumers = zk.get_children(consumers_path)
		except NoNodeError:
			consumer_groups_status[consumer_group]=0 # 0 = offline
		else:
			consumer_groups_status[consumer_group]=len(consumers) # (not 0) =  online

	cluster_topology = {'cluster':cluster,'brokers':brokers,'consumer_groups':consumer_groups, 'consumer_groups_status':consumer_groups_status}
	zk.stop()
	return cluster_topology
Example no. 23
class ZkServiceRegister:
    def __init__(self, zk_address, zk_timeout):
        self.__zkClient = KazooClient(hosts=zk_address, timeout=zk_timeout, read_only=False)
        self.__zkListener = ZkServiceRegisterListener(self.__zkClient)
        self.__zkClient.add_listener(self.__zkListener)
        self.__zkClient.start()

    def register(self, path, host, port, weight=DEFAULT_HOST_WEIGHT):
        try:
            if not self.__zkClient.exists(path):
                self.__zkClient.ensure_path(path)
        except Exception, e:
            print e.message

        reg_path = path + '/' + host + ':' + str(port) + ':' + str(weight)
        if self.__zkClient.exists(reg_path):
            self.__zkClient.delete(reg_path)

        self.__zkClient.create(reg_path, value='', ephemeral=True)
Example no. 24
class ConfigFlags(StorageBase):
    """ A configuration manager using ZooKeeper. For setting flags
    on all instances for a given product. This will default to
    using a locally stored cache if ZooKeeper fails to respond.

    Note: current live flags are limited to 1MB total.

    Set initial values in config.ini file as
        flags.foo = bar
    to set "foo" flag to value bar.

    """

    #TODO:
    # * Break flags into separate elements instead of single JSON?
    # * Add fake dict to hold settings if ZK not installed/available.
    localFlags = {}
    version = None

    def __init__(self, config, **kw):
        try:
            if 'Configurator' in type(config).__name__:
                config = config.get_settings()
            conf = config.get('flags.zk.settings')
            if conf is not None:
                conf = dict(json.loads(conf))
                self.zk = KazooClient(**conf)
            else:
                self.zk = KazooClient()
            # get a copy of the local flags.
            self.zk_path = config.get('flags.zk.path',
                                      '/general/config')
            self.zk.start()
            node = self.zk.exists(self.zk_path)
            if node is None:
                # Virgin install, set from the config values.
                self._init_zk(config)
            self.zk.add_listener(self._zk_listener)
            self._refreshCache(config=config)

        except Exception, e:
            warnings.warn("Could not connect to ZooKeeper %s" % repr(e))
Example no. 25
    def init_app(self, app):
        """
        Read kazoo settings from app configuration,
        setup kazoo client for application

        :param app: Flask application instance.

        """
        app.config.setdefault('KAZOO_HOSTS', '127.0.0.1:2181')
        app.config.setdefault('KAZOO_START_TIMEOUT', 3)
        app.config.setdefault('KAZOO_START_BLOCKING', False)

        app.config.setdefault('KAZOO_SESSION_TIMEOUT', 10.0)  # kazoo default

        app.config.setdefault('KAZOO_DEFAULT_RETRY', True)
        app.config.setdefault('KAZOO_RETRY_MAX_DELAY_SECONDS', 60 * 60)  # kazoo default of 1hr.

        # Put the kazoo client into application extensions
        if 'kazoo' not in app.extensions:
            app.extensions['kazoo'] = {}

        # Initialize connection and store it to extensions
        if app.config['KAZOO_DEFAULT_RETRY']:
            retry_kwargs = {
                'max_delay': app.config['KAZOO_RETRY_MAX_DELAY_SECONDS']
            }
        else:
            retry_kwargs = None

        kazoo_client = KazooClient(hosts=app.config['KAZOO_HOSTS'],
                                   timeout=app.config['KAZOO_SESSION_TIMEOUT'],
                                   connection_retry=retry_kwargs,
                                   command_retry=retry_kwargs)

        if app.config['KAZOO_START_BLOCKING']:
            kazoo_client.start(app.config['KAZOO_START_TIMEOUT'])
        else:
            kazoo_client.start_async()

        kazoo_client.add_listener(self.connection_state_listener)

        app.extensions['kazoo']['client'] = kazoo_client
Example no. 26
class TestSessions(unittest.TestCase):

    def setUp(self):
        from kazoo.client import KazooClient
        from kazoo.protocol.states import KazooState
        from kazoo.testing.common import ZookeeperCluster
        ZK_HOME = os.environ.get("ZOOKEEPER_PATH")
        ZK_CLASSPATH = os.environ.get("ZOOKEEPER_CLASSPATH")
        self.cluster = ZookeeperCluster(ZK_HOME, size=1, port_offset=21000, classpath=ZK_CLASSPATH)
        self.cluster.start()
        atexit.register(lambda cluster: self.cluster.terminate(), self.cluster)
        self.client = KazooClient(self.cluster[0].address, max_retries=5)
        self.ev = threading.Event()

        def back(state):
            if state == KazooState.CONNECTED:
                self.ev.set()
        self.client.start()
        self.path = self.client.create("/" + uuid.uuid4().hex)
        self.client.add_listener(back)

    def test_restarted_server(self):
        raise SkipTest('Patch missing')
        self.cluster.stop()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertTrue(self.client.retry(self.client.exists, self.path))

    def test_terminated_server(self):
        raise SkipTest('Patch missing')
        self.cluster.reset()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertFalse(self.client.retry(self.client.exists, self.path))

    def tearDown(self):
        self.ev.clear()
        self.client.stop()
        self.cluster.stop()
Example no. 27
class ZkServiceProvider:
    def __init__(self, zk_address, zk_timeout, connection):
        self.__service_dict = {}
        self.__zk_address = zk_address
        self.__zkClient = KazooClient(hosts=zk_address, timeout=zk_timeout, read_only=True)
        self.__zkClient.start()
        self.__zkListener = ZkServiceProviderListener(self.__zkClient)
        self.__zkClient.add_listener(self.__zkListener)
        self.__connection = connection

    def register_service(self, service, zk_path, client_cls):
        self.__service_dict[service] = (zk_path, client_cls)
        result = self._register_watcher(service, zk_path, client_cls)
        return result

    def _register_watcher(self, service, zk_path, client_cls):
        @self.__zkClient.ChildrenWatch(zk_path)
        def child_changed(data):
            print '+++++++++++++++' + service + ' child changed.++++++++++++++++++'
            print data
            hosts = data
            self.__connection.update_service(service, hosts)

        isExists = self.__zkClient.exists(zk_path)
        if not isExists:
            return False

        try:
            hosts = self.__zkClient.get_children(zk_path)
        except NoNodeError:
            print 'no node for the path of ' + zk_path
            return False
        except:
            print 'other exceptions.'
            return False

        self.__connection.update_service(service, hosts)
        return True

    def stop(self):
        self.__zkClient.stop()
Example no. 28
def init():

    global inited
    zk = None
    try:
        zk = KazooClient(hosts='127.0.0.1:2181')
        zk.add_listener(state_listener)
        zk.start()
        register(stop_zk, zk)
        create_path_if_not_exists(zk, '/jobs')
        create_path_if_not_exists(zk, '/watchers')
        create_path_if_not_exists(zk, '/watchlocks')
        create_path_if_not_exists(zk, '/executors')
    except Exception as e:
        print 'Zk problem ', e
        if zk is not None:
            zk.stop()
        sys.exit(1)

    inited = True
    return zk
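
init() relies on helpers that are not shown: create_path_if_not_exists, stop_zk, and register (plausibly atexit.register). A minimal sketch of the two helpers, inferred from the call sites; the implementations are assumptions:

from atexit import register

def create_path_if_not_exists(zk, path):
    # ensure_path is idempotent; the exists() check only saves a round trip.
    if not zk.exists(path):
        zk.ensure_path(path)

def stop_zk(zk):
    zk.stop()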
Example no. 29
File: zha.py Project: bwtakacy/zha
class ClusterMonitor(threading.Thread):
    """periodically checks cluster member.
    This class is delegated to change state between ACT clustered and ACT declustered."""
    def __init__(self, zha):
        threading.Thread.__init__(self)
        self.zha = zha
        self.should_run = True
        self.zk = KazooClient(hosts=self.zha.config.get("connection_string","127.0.0.1:2181"), logger=logger)
        self.zk.add_listener(self._zk_listener)
        self.zk.start()
        self.zroot = self.zha.config.get("cluster_znode","/zha-state")
        self.znode = self.zroot + "/" + self.zha.config.get("id") 
        self._zk_register(first=True)
        self.not_alone = None
    def run(self):
        while self.should_run:
            time.sleep(self.zha.config.get("clustercheck_interval",3))
            self.zha.recheck()
            self._zk_register()
            self.check_cluster()
            self.trigger()
        if self.zha.is_clustered:
            self.zha.config.become_declustered()
            self.zha.is_clustered = False
        self.zk.delete(self.znode)
        logger.info("cluster monitor thread stopped.")
    def check_cluster(self):
        try:
            count = 0
            chs = self.zk.get_children(self.zroot)
            for ch in chs:
                data, stats = self.zk.get(self.zroot+"/"+ch)
                if data.strip()=="SBY:HEALTHY" and ch != self.zha.config.get("id"):
                    count += 1
            if count != 0:
                self.not_alone = time.time()
            logger.debug("healthy sbys: %d"%(count,))
        except Exception,e:
            logger.warn("check cluster failed. Try next time.%s"%e)
Example no. 30
def _get_topology():
	topology = CLUSTERS.get()
	clusters = []
	for cluster in topology:
		zk = KazooClient(hosts=CLUSTERS[cluster].ZK_HOST_PORTS.get())
		zk.add_listener(my_listener)
		zk.start()
		brokers = _get_brokers(zk,cluster)
		consumer_groups = _get_consumer_groups(zk,cluster)
		consumer_groups_status = {} # 0 = offline, (not 0) =  online
		for consumer_group in consumer_groups:
			consumers_path = CLUSTERS[cluster].CONSUMERS_PATH.get() + "/" + consumer_group + "/ids"
			try:
				consumers = zk.get_children(consumers_path)
			except NoNodeError:
				consumer_groups_status[consumer_group]=0 # 0 = offline
			else:
				consumer_groups_status[consumer_group]=len(consumers) # (not 0) =  online
		c = {'cluster':get_cluster_or_404(id=cluster),'brokers':brokers,'consumer_groups':consumer_groups,'consumer_groups_status':consumer_groups_status}
		clusters.append(c)
		zk.stop()
	return clusters
Example no. 31
class ZkStateManager(StateManager):
  """
  State manager which connects to zookeeper and
  gets and sets states from there.
  """

  def __init__(self, name, host, port, rootpath, tunnelhost):
    self.name = name
    self.host = host
    self.port = port
    # host:port string referenced by start(); derived here so it is defined
    self.hostport = "%s:%s" % (host, port)
    self.tunnelhost = tunnelhost
    self.rootpath = rootpath

  def start(self):
    if self.is_host_port_reachable():
      self.client = KazooClient(self.hostport)
    else:
      localport = self.establish_ssh_tunnel()
      self.client = KazooClient("localhost:" + str(localport))
    self.client.start()

    def on_connection_change(state):
      LOG.info("Connection state changed to: " + state)
    self.client.add_listener(on_connection_change)

  def stop(self):
    self.client.stop()
    self.terminate_ssh_tunnel()

  def get_topologies(self, callback=None):
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
      "result": None
    }
    if callback:
      isWatching = True
    else:
      # Custom callback to get the topologies
      # right now.
      def callback(data):
        ret["result"] = data

    self._get_topologies_with_watch(callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_topologies_with_watch(self, callback, isWatching):
    """
    Helper function to get topologies with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_topologies_path()
    if isWatching:
      LOG.info("Adding children watch for path: " + path)

    @self.client.ChildrenWatch(path)
    def watch_topologies(topologies):
      callback(topologies)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_topology(self, topologyName, callback=None):
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
      "result": None
    }
    if callback:
      isWatching = True
    else:
      # Custom callback to get the topologies
      # right now.
      def callback(data):
        ret["result"] = data

    self._get_topology_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_topology_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_topology_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    @self.client.DataWatch(path)
    def watch_topology(data, stats):
      if data:
        topology = Topology()
        topology.ParseFromString(data)
        callback(topology)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_topology(self, topologyName, topology):
    if not topology or not topology.IsInitialized():
      raise StateException("Topology protobuf not init properly",
                        StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_topology_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
      topologyName, path))
    topologyString = topology.SerializeToString()
    try:
      self.client.create(path, value=topologyString, makepath=True)
      return True
    except NoNodeError as e:
      raise StateException("NoNodeError while creating topology",
                        StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError as e:
      raise StateException("NodeExistsError while creating topology",
                        StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError as e:
      raise StateException("Zookeeper while creating topology",
                        StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception as e:
      # Just re raise the exception.
      raise

  def delete_topology(self, topologyName):
    path = self.get_topology_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
      topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError as e:
      raise StateException("NoNodeError while deteling topology",
                        StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError as e:
      raise StateException("NotEmptyError while deleting topology",
                        StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError as e:
      raise StateException("Zookeeper while deleting topology",
                        StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception as e:
      # Just re raise the exception.
      raise

  def get_pplan(self, topologyName, callback=None):
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
      "result": None
    }
    if callback:
      isWatching = True
    else:
      # Custom callback to get the topologies
      # right now.
      def callback(data):
        ret["result"] = data

    self._get_pplan_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_pplan_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_pplan_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    @self.client.DataWatch(path)
    def watch_pplan(data, stats):
      if data:
        pplan = PhysicalPlan()
        pplan.ParseFromString(data)
        callback(pplan)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_pplan(self, topologyName, pplan):
    if not pplan or not pplan.IsInitialized():
      raise StateException("Physical Plan protobuf not init properly",
                        StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_pplan_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
      topologyName, path))
    pplanString = pplan.SerializeToString()
    try:
      self.client.create(path, value=pplanString, makepath=True)
      return True
    except NoNodeError as e:
      raise StateException("NoNodeError while creating pplan",
                        StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError as e:
      raise StateException("NodeExistsError while creating pplan",
                        StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError as e:
      raise StateException("Zookeeper while creating pplan",
                        StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception as e:
      # Just re raise the exception.
      raise

  def delete_pplan(self, topologyName):
    path = self.get_pplan_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
      topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError as e:
      raise StateException("NoNodeError while deleting pplan",
                        StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError as e:
      raise StateException("NotEmptyError while deleting pplan",
                        StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError as e:
      raise StateException("Zookeeper while deleting pplan",
                        StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception as e:
      # Just re raise the exception.
      raise

  def get_execution_state(self, topologyName, callback=None):
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
      "result": None
    }
    if callback:
      isWatching = True
    else:
      # Custom callback to get the topologies
      # right now.
      def callback(data):
        ret["result"] = data

    self._get_execution_state_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_execution_state_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get execution state with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_execution_state_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    @self.client.DataWatch(path)
    def watch_execution_state(data, stats):
      if data:
        executionState = ExecutionState()
        executionState.ParseFromString(data)
        callback(executionState)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_execution_state(self, topologyName, executionState):
    if not executionState or not executionState.IsInitialized():
      raise StateException("Execution State protobuf not init properly",
                        StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_execution_state_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
      topologyName, path))
    executionStateString = executionState.SerializeToString()
    try:
      self.client.create(path, value=executionStateString, makepath=True)
      return True
    except NoNodeError as e:
      raise StateException("NoNodeError while creating execution state",
                        StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError as e:
      raise StateException("NodeExistsError while creating execution state",
                        StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError as e:
      raise StateException("Zookeeper while creating execution state",
                        StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception as e:
      # Just re raise the exception.
      raise

  def delete_execution_state(self, topologyName):
    path = self.get_execution_state_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
      topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError as e:
      raise StateException("NoNodeError while deleting execution state",
                        StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError as e:
      raise StateException("NotEmptyError while deleting execution state",
                        StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError as e:
      raise StateException("Zookeeper while deleting execution state",
                        StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception as e:
      # Just re raise the exception.
      raise

  def get_tmaster(self, topologyName, callback=None):
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
      "result": None
    }
    if callback:
      isWatching = True
    else:
      # Custom callback to get the topologies
      # right now.
      def callback(data):
        ret["result"] = data

    self._get_tmaster_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_tmaster_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_tmaster_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    @self.client.DataWatch(path)
    def watch_tmaster(data, stats):
      if data:
        tmaster = TMasterLocation()
        tmaster.ParseFromString(data)
        callback(tmaster)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_scheduler_location(self, topologyName, callback=None):
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
      "result": None
    }
    if callback:
      isWatching = True
    else:
      # Custom callback to get the scheduler location
      # right now.
      def callback(data):
        ret["result"] = data

    self._get_scheduler_location_with_watch(topologyName, callback, isWatching)

    return ret["result"]

  def _get_scheduler_location_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get scheduler location with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_scheduler_location_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    @self.client.DataWatch(path)
    def watch_scheduler_location(data, stats):
      if data:
        scheduler_location = SchedulerLocation()
        scheduler_location.ParseFromString(data)
        callback(scheduler_location)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching
Example no. 32
class ZookeeperRegistry(Registry):
    _app_config = ApplicationConfig('default_app')
    _connect_state = 'UNCONNECT'

    def __init__(self, zk_hosts, application_config=None):
        Registry.__init__(self)
        if application_config:
            self._app_config = application_config
        self.__zk = KazooClient(hosts=zk_hosts)
        self.__zk.add_listener(self.__state_listener)
        self.__zk.start()

    def __state_listener(self, state):
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            self._connect_state = state
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            # print 'disconnect from zookeeper'
            self._connect_state = state
        else:
            # Handle being connected/reconnected to Zookeeper
            # print 'connected'
            self._connect_state = state

    def __unquote(self, origin_nodes):
        return (urllib.parse.unquote(child_node) for child_node in origin_nodes
                if child_node)

    def _do_event(self, event):
        # event.path is something like /dubbo/com.ofpay.demo.api.UserProvider/providers
        # to extract the service name, strip the leading /dubbo/ and the trailing /providers
        # the service node list queried from zookeeper is collected into a dict
        # the node urls kept in zookeeper look like the following
        logger.info("receive event is {0}, event state is {1}".format(
            event, event.state))
        provide_name = event.path[7:event.path.rfind('/')]
        if event.state in ['CONNECTED', 'DELETED']:
            children = self.__zk.get_children(event.path,
                                              watch=self.event_listener)
            self._compare_swap_nodes(provide_name, self.__unquote(children))
            configurators_nodes = self._get_provider_configuration(
                provide_name)
            self._set_provider_configuration(provide_name, configurators_nodes)
        # print(self._service_providers)

    def _do_config_event(self, event):
        """
        the zk directory path looks like /dubbo/com.qianmi.pc.api.es.item.EsGoodsQueryProvider/configurators
        :param event:
        :return:
        """
        logger.info("receive config event is {0}, event state is {1}".format(
            event, event.state))
        provide_name = event.path[7:event.path.rfind('/')]
        configurators_nodes = self._get_provider_configuration(provide_name)
        self._set_provider_configuration(provide_name, configurators_nodes)

        # print(self._service_providers)

    def register(self, interface, **kwargs):
        ip = self.__zk._connection._socket.getsockname()[0]
        params = {
            'interface': interface,
            'application': self._app_config.name,
            'application.version': self._app_config.version,
            'category': 'consumer',
            'dubbo': 'dubbo-client-py-1.0.1',
            'environment': self._app_config.environment,
            'method': '',
            'owner': self._app_config.owner,
            'side': 'consumer',
            'pid': os.getpid(),
            'version': '1.0'
        }
        url = 'consumer://{0}/{1}?{2}'.format(ip, interface,
                                              urllib.parse.urlencode(params))
        # print urllib.quote(url, safe='')

        consumer_path = '{0}/{1}/{2}'.format('dubbo', interface, 'consumers')
        self.__zk.ensure_path(consumer_path)
        self.__zk.create(consumer_path + '/' +
                         urllib.parse.quote(url, safe=''),
                         ephemeral=True)

    def subscribe(self, interface, **kwargs):
        """
        Watch the registry for services going online/offline
        :param interface: a service name such as com.ofpay.demo.api.UserProvider
        :return: None
        """
        version = kwargs.get('version', '')
        group = kwargs.get('group', '')
        providers_children = self.__zk.get_children('{0}/{1}/{2}'.format(
            'dubbo', interface, 'providers'),
                                                    watch=self.event_listener)
        logger.debug("watch node is {0}".format(providers_children))
        self.__zk.get_children('{0}/{1}/{2}'.format('dubbo', interface,
                                                    'configurators'),
                               watch=self.configuration_listener)
        # re-add all nodes
        self._compare_swap_nodes(interface, self.__unquote(providers_children))

        configurators_nodes = self._get_provider_configuration(interface)
        self._set_provider_configuration(interface, configurators_nodes)

    def _get_provider_configuration(self, interface):
        """
        Fetch dubbo custom configuration data from the "/dubbo/{interface}/configurators" path
        :param interface:
        :return:
        """
        try:
            configurators_nodes = self.__zk.get_children(
                '{0}/{1}/{2}'.format('dubbo', interface, 'configurators'),
                watch=self.configuration_listener)
            logger.debug(
                "configurators node is {0}".format(configurators_nodes))
            return self.__unquote(configurators_nodes)
        except Exception as e:
            logger.warn("get provider %s configuration error %s", interface,
                        str(e))
Example no. 33
class ZooKeeper(AbstractDCS):

    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry=KazooRetry(max_delay=1, max_tries=-1,
                                   sleep_func=time.sleep), command_retry=KazooRetry(deadline=config['retry_timeout'],
                                   max_delay=1, max_tries=-1, sleep_func=time.sleep))
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        retry = self._client.retry if isinstance(self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            watch = member == sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(history[1].mzxid, history[0])

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self._fetch_cluster and self.get_node(self.leader_optime_path)
        last_leader_operation = last_leader_operation and int(last_leader_operation[0]) or 0

        # get synchronization state
        sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.sync_standby or None
        members = self.load_members(sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)

    def _load_cluster(self):
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        return cluster

    def _create(self, path, value, retry=False, ephemeral=False):
        try:
            if retry:
                self._client.retry(self._client.create, path, value, makepath=True, ephemeral=ephemeral)
            else:
                self._client.create_async(path, value, makepath=True, ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path, self._name.encode('utf-8'), retry=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self, key, value, index=None, retry=False, do_not_create_empty=False):
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set, key, value, version=index or -1)
            else:
                self._client.set_async(key, value, version=index or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        if member and (self._client.client_id is not None and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}), data.get('tags', {})) and
                            member.data.get('version') == data.get('version') and
                            member.data.get('checkpoint_after_promote') == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(self.member_path, encoded_data, makepath=True,
                                          ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        return self._set_or_create(self.sync_path, value, index, retry=True, do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        if super(ZooKeeper, self).watch(leader_index, timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
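
Stripped of the Patroni plumbing, the connection setup in this example boils down to giving
kazoo two retry policies: retry connections forever with capped backoff, and bound each
command by a deadline. A minimal sketch with placeholder host and timeout values:

import time
from kazoo.client import KazooClient
from kazoo.retry import KazooRetry

client = KazooClient(
    '127.0.0.1:2181',  # placeholder host
    timeout=30,        # session timeout (the "ttl")
    # Reconnect forever, backing off to at most 1 second between attempts.
    connection_retry=KazooRetry(max_delay=1, max_tries=-1, sleep_func=time.sleep),
    # Retry individual commands, but give up after a 10 second deadline.
    command_retry=KazooRetry(deadline=10, max_delay=1, max_tries=-1, sleep_func=time.sleep),
)
client.start()
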
Esempio n. 34
0
class ZooKeeper(object):

    log = logging.getLogger("OpenLabCMD.ZooKeeper")

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self, config=None):
        """
        Zookeeper Client for OpenLab HA management.

        :param config: The config object.
        :type config: configparser.ConfigParser
        """
        self.client = None
        self.config = config
        if self.config and not isinstance(self.config,
                                          configparser.ConfigParser):
            raise exceptions.ClientError("config should be a ConfigParser "
                                         "object.")
        self._last_retry_log = 0

    def _connection_listener(self, state):
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
        elif state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
        else:
            self.log.debug("ZooKeeper connection: CONNECTED")

    def logConnectionRetryEvent(self):
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    @property
    def connected(self):
        if self.client is None:
            return False
        return self.client.state == KazooState.CONNECTED

    @property
    def suspended(self):
        if self.client is None:
            return True
        return self.client.state == KazooState.SUSPENDED

    @property
    def lost(self):
        if self.client is None:
            return True
        return self.client.state == KazooState.LOST

    def connect(self, hosts=None, timeout=None, read_only=False):
        if not hosts:
            if not self.config:
                raise exceptions.ClientError('Either config object or hosts '
                                             'string should be provided.')
            try:
                hosts = hosts or self.config.get('ha', 'zookeeper_hosts')
            except (configparser.NoOptionError, configparser.NoSectionError):
                raise exceptions.ClientError(
                    "The config doesn't contain [ha]zookeeper_hosts option.")

        if not timeout:
            timeout = self.config.get('ha', 'zookeeper_connect_timeout',
                                      fallback=5)
        retry_limit = self.config.get('ha', 'zookeeper_connect_retry_limit',
                                      fallback=5)
        try:
            timeout = int(timeout)
        except ValueError:
            raise exceptions.ClientError("zookeeper_connect_timeout "
                                         "should be int-like format.")
        if timeout <= 0:
            raise exceptions.ClientError("zookeeper_connect_timeout "
                                         "should be larger than 0.")

        if self.client is None:
            self.client = KazooClient(hosts=hosts, timeout=timeout,
                                      read_only=read_only)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            tried_times = 0
            while tried_times < retry_limit:
                try:
                    self.client.start(1)
                    break
                except Exception:
                    self.logConnectionRetryEvent()
                tried_times += 1
                if tried_times == retry_limit:
                    self.client = None
                    raise exceptions.ClientError(
                        "Tried %s times, failed to connect to "
                        "zookeeper." % retry_limit)

    def disconnect(self):
        if self.client is not None and self.client.connected:
            self.client.stop()
            self.client.close()
            self.client = None

    def _client_check_wrapper(func):
        def wrapper(self, *args, **kwargs):
            if not self.client:
                raise exceptions.ClientError(
                    "Should call connect function first to initialise "
                    "zookeeper client")
            return func(self, *args, **kwargs)
        return wrapper

    @_client_check_wrapper
    def list_nodes(self, with_zk=True, node_role_filter=None,
                   node_type_filter=None):
        if node_role_filter:
            if isinstance(node_role_filter, str):
                node_role_filter = [node_role_filter]
            if not isinstance(node_role_filter, list):
                raise exceptions.ValidationError("node_role_filter should be "
                                                 "a list or string.")
        if node_type_filter:
            if isinstance(node_type_filter, str):
                node_type_filter = [node_type_filter]
            if not isinstance(node_type_filter, list):
                raise exceptions.ValidationError("node_type_filter should be "
                                                 "a list or string.")

        path = '/ha'
        try:
            nodes_objs = []
            for exist_node in self.client.get_children(path):
                if exist_node == 'configuration':
                    continue
                if not with_zk and 'zookeeper' in exist_node:
                    continue
                node_obj = self.get_node(exist_node)
                if node_role_filter and node_obj.role not in node_role_filter:
                    continue
                if node_type_filter and node_obj.type not in node_type_filter:
                    continue
                nodes_objs.append(node_obj)
        except kze.NoNodeError:
            return []
        return sorted(nodes_objs, key=lambda x: x.name)

    @_client_check_wrapper
    def get_node(self, node_name):
        try:
            node_bytes = self.client.get('/ha/%s' % node_name)
            node_obj = node.Node.from_zk_bytes(node_bytes)
            return node_obj
        except kze.NoNodeError:
            raise exceptions.ClientError('Node %s not found.' % node_name)

    def _init_service(self, node_name, node_type):
        path = '/ha/%s' % node_name
        master_service_path = path + '/master'
        slave_service_path = path + '/slave'
        zookeeper_service_path = path + '/zookeeper'

        self.client.create(master_service_path)
        self.client.create(slave_service_path)
        self.client.create(zookeeper_service_path)

        for node_role, all_services in service.service_mapping.items():
            new_service_path = path + '/%s' % node_role
            try:
                node_services = all_services[node_type]
            except KeyError:
                continue
            for service_type, service_names in node_services.items():
                service_class = (service.NecessaryService if
                                 service_type == 'necessary' else
                                 service.UnnecessaryService)
                for service_name in service_names:
                    new_service = service_class(service_name, node_name)
                    self.client.create(
                        new_service_path + '/%s' % service_name,
                        value=new_service.to_zk_bytes())

    @_client_check_wrapper
    def create_node(self, name, role, n_type, ip):
        existed_nodes = self.list_nodes()
        for existed_node in existed_nodes:
            if existed_node.role == role and existed_node.type == n_type:
                raise exceptions.ClientError(
                    "The role and type of the node should be unique.")

        path = '/ha/%s' % name
        new_node = node.Node(name, role, n_type, ip)
        try:
            self.client.create(path,
                               value=new_node.to_zk_bytes(),
                               makepath=True)
        except kze.NodeExistsError:
            raise exceptions.ClientError("The node %s is already existed."
                                         % name)
        self._init_service(name, n_type)
        node_obj = self.get_node(name)
        return node_obj

    @_client_check_wrapper
    def update_node(self, node_name, maintain=None, role=None, **kwargs):
        path = '/ha/%s' % node_name
        node_obj = self.get_node(node_name)
        if maintain is not None:
            if maintain:
                if node_obj.status == node.NodeStatus.UP:
                    node_obj.status = node.NodeStatus.MAINTAINING
                else:
                    raise exceptions.ClientError(
                        "The node must be in 'up' status when trying to "
                        "maintain it.")
            else:
                if node_obj.status == node.NodeStatus.MAINTAINING:
                    node_obj.status = node.NodeStatus.UP
                    node_obj.heartbeat = datetime.datetime.utcnow().strftime(
                        '%Y-%m-%d %H:%M:%S')
                else:
                    raise exceptions.ClientError(
                        "The node must be in 'maintaining' status when trying "
                        "to un-maintain it.")
        if role:
            node_obj.role = role
        switch_status = kwargs.get('switch_status')
        if switch_status is not None:
            if switch_status.lower() not in ['start', 'end']:
                raise exceptions.ClientError(
                    "switch_status must be 'start' or 'end'.")
        node_obj.update(kwargs)
        self.client.set(path, value=node_obj.to_zk_bytes())

        node_obj = self.get_node(node_name)
        return node_obj

    @_client_check_wrapper
    def delete_node(self, node_name):
        self.get_node(node_name)
        path = '/ha/%s' % node_name
        self.client.delete(path, recursive=True)

    @_client_check_wrapper
    def list_services(self, node_name_filter=None, node_role_filter=None,
                      status_filter=None):
        """
        List the services in the HA deployment.
        :param node_name_filter: The node name filter.
        :type node_name_filter: list or string.
        :param node_role_filter: The node role filter.
        :type node_role_filter: list or string.
        :param status_filter: The status filter.
        :type status_filter: list or string.
        :return: the services list.
        """
        if node_name_filter:
            if isinstance(node_name_filter, str):
                node_name_filter = [node_name_filter]
            if not isinstance(node_name_filter, list):
                raise exceptions.ValidationError("node_name_filter should be "
                                                 "a list or string.")
        if node_role_filter:
            if isinstance(node_role_filter, str):
                node_role_filter = [node_role_filter]
            if not isinstance(node_role_filter, list):
                raise exceptions.ValidationError("node_role_filter should be "
                                                 "a list or string.")
        if status_filter:
            if isinstance(status_filter, str):
                status_filter = [status_filter]
            if not isinstance(status_filter, list):
                raise exceptions.ValidationError("status_filter should be "
                                                 "a list or string.")

        result = []
        for exist_node in self.list_nodes():
            if node_name_filter and exist_node.name not in node_name_filter:
                continue
            if node_role_filter and exist_node.role not in node_role_filter:
                continue
            path = '/ha/%s/%s' % (exist_node.name, exist_node.role)
            for service_name in self.client.get_children(path):
                service_path = path + '/' + service_name
                service_bytes = self.client.get(service_path)
                service_obj = service.Service.from_zk_bytes(service_bytes)
                if status_filter and service_obj.status not in status_filter:
                    continue
                result.append(service_obj)
        return sorted(result, key=lambda x: x.node_name)

    @_client_check_wrapper
    def get_service(self, service_name, node_name):
        service_node = self.get_node(node_name)
        path = '/ha/%s/%s/%s' % (service_node.name, service_node.role,
                                 service_name)
        try:
            service_bytes = self.client.get(path)
        except kze.NoNodeError:
            raise exceptions.ClientError('Service %s not found.' %
                                         service_name)
        service_obj = service.Service.from_zk_bytes(service_bytes)
        return service_obj

    @_client_check_wrapper
    def update_service(self, service_name, node_name, alarmed=None,
                       restarted=None, status=None, **kwargs):
        old_service = self.get_service(service_name, node_name)
        service_node = self.get_node(node_name)
        path = '/ha/%s/%s/%s' % (service_node.name, service_node.role,
                                 service_name)
        current_time = datetime.datetime.utcnow().isoformat()

        if alarmed is not None:
            if not isinstance(alarmed, bool):
                raise exceptions.ValidationError('alarmed should be boolean '
                                                 'value.')
            old_service.alarmed = alarmed
            if alarmed:
                old_service.alarmed_at = current_time
        if restarted is not None:
            if not isinstance(restarted, bool):
                raise exceptions.ValidationError('restarted should be '
                                                 'boolean value.')
            old_service.restarted = restarted
            if restarted:
                old_service.restarted_at = current_time
        if status:
            if status not in service.ServiceStatus().all_status:
                raise exceptions.ValidationError(
                    'status should be in %s.' %
                    service.ServiceStatus().all_status)
            old_service.status = status

        old_service.update(kwargs)
        self.client.set(path, value=old_service.to_zk_bytes())

        new_service = self.get_service(service_name, node_name)
        return new_service

    @_client_check_wrapper
    def switch_master_and_slave(self):
        """Mark node's switch status to start.

        This func is called by the labkeeper deploy tool so that operators
        can switch the master and slave roles by hand. Once the health
        checker finds that all nodes' switch status is `start`, it will
        start to switch the cluster.
        """
        for node in self.list_nodes():
            if node.type != 'zookeeper':
                self.update_node(node.name, switch_status='start')

    @_client_check_wrapper
    def check_and_repair_deployment_sg(self, is_dry_run=False):
        """Check and Repair current HA deployment Security Group configuration

        This func is called by the labkeeper deploy tool so that operators
        can check and repair an existing deployment from zookeeper. The
        function checks the cloud Security Group configuration.
        """
        deploy_map = {}
        cloud_provide_rules = {}
        unexpect_rules = {}
        for node in self.list_nodes():
            ha_ports_cp = copy.deepcopy(constants.HA_PORTS)
            if node.type == 'nodepool':
                ha_ports_cp.remove(constants.MYSQL_HA_PORT)
            elif node.type == 'zuul':
                for p in constants.ZOOKEEPER_HA_PORTS:
                    ha_ports_cp.remove(p)
            elif node.type == 'zookeeper':
                ha_ports_cp.remove(constants.RSYNCD_HA_PORT)
                ha_ports_cp.remove(constants.MYSQL_HA_PORT)
            if node.name.split("-")[0] not in deploy_map:
                deploy_map[node.name.split("-")[0]] = {'nodes': [node]}
                cloud_provide_rules[node.name.split("-")[0]] = {
                    node.ip + '/32': ha_ports_cp}
            else:
                deploy_map[node.name.split("-")[0]]['nodes'].append(node)
                cloud_provide_rules[node.name.split("-")[0]][
                    node.ip + '/32'] = ha_ports_cp

        # Build expect_rules: each cloud must allow its own node IPs plus
        # the relevant ports of every peer cloud's nodes
        expect_rules = {}
        sg_map = {}
        cloud_names = list(cloud_provide_rules.keys())
        for cloud_name, ip_dict in cloud_provide_rules.items():
            c_names = copy.deepcopy(cloud_names)
            c_names.remove(cloud_name)
            expect_rules[cloud_name] = copy.deepcopy(ip_dict)
            if len(cloud_provide_rules[cloud_name].keys()) > 1:
                for c_name in c_names:
                    expect_rules[cloud_name].update(
                        copy.deepcopy(cloud_provide_rules[c_name]))
            else:
                for c_name in c_names:
                    for ip in cloud_provide_rules[c_name].keys():
                        if 2888 in cloud_provide_rules[c_name][ip]:
                            zk_ha_ports = copy.deepcopy(
                                constants.ZOOKEEPER_HA_PORTS)
                            expect_rules[cloud_name][ip] = zk_ha_ports
                        else:
                            expect_rules[cloud_name][ip] = [2181]

        for cloud_name, nodes_dict in deploy_map.items():
            net_client = os_client_config.make_rest_client(
                'network', cloud=cloud_name)
            for sg_name in constants.HA_SGs:
                url = "/security-groups?name=%s" % sg_name
                resp = net_client.get(url)
                if resp.status_code != 200:
                    raise exceptions.ClientError(
                        'Security group %(sg_name)s not found on '
                        'cloud %(cloud_name)s.' % {'sg_name': sg_name,
                                                   'cloud_name': cloud_name})
                sgr_data = resp.json()['security_groups'][0]
                if cloud_name not in sg_map:
                    sg_map[cloud_name] = resp.json()[
                        'security_groups'][0]['id']
                for rule in sgr_data['security_group_rules']:
                    if rule['direction'] != 'ingress':
                        continue

                    is_specified_1_port = (
                            rule['port_range_min'] == rule['port_range_max'])
                    is_ipv4 = rule['ethertype'] == 'IPv4'
                    is_tcp = rule['protocol'] == 'tcp'

                    if not expect_rules[cloud_name].get(
                            rule['remote_ip_prefix']):
                        if cloud_name not in unexpect_rules:
                            unexpect_rules[cloud_name] = [
                                (rule['remote_ip_prefix'],
                                 rule['port_range_min'], rule['id'])]
                        else:
                            unexpect_rules[cloud_name].append(
                                (rule['remote_ip_prefix'],
                                 rule['port_range_min'], rule['id']))
                    else:
                        if (is_specified_1_port and is_ipv4 and is_tcp and
                                rule['port_range_min'] in expect_rules[
                                    cloud_name][rule['remote_ip_prefix']]):
                            expect_rules[cloud_name][
                                rule['remote_ip_prefix']].remove(
                                rule['port_range_min'])
                            if len(expect_rules[cloud_name][
                                       rule['remote_ip_prefix']]) == 0:
                                expect_rules[cloud_name].pop(
                                    rule['remote_ip_prefix'])
                        else:
                            if cloud_name not in unexpect_rules:
                                unexpect_rules[cloud_name] = [
                                    (rule['remote_ip_prefix'],
                                     rule['port_range_min'], rule['id'])]
                            else:
                                unexpect_rules[cloud_name].append((
                                    rule['remote_ip_prefix'],
                                    rule['port_range_min'], rule['id']))

        if not is_dry_run:
            # Any entries left in expect_rules are missing rules to recover
            for cloud_name, ip_dict in expect_rules.items():
                if not ip_dict:
                    print("Cloud %s: PASSED" % cloud_name)
                    continue

                print("Recover security group rules for cloud %s:" %
                      cloud_name)
                # Reaching here means the security group lacks some expected rules
                net_client = os_client_config.make_rest_client(
                    'network', cloud=cloud_name)
                for ip, ports in ip_dict.items():
                    req = {
                        "security_group_rule": {
                            "direction": "ingress",
                            "ethertype": "IPv4",
                            "protocol": "tcp",
                            "security_group_id": sg_map[cloud_name],
                            "remote_ip_prefix": ip
                        }
                    }
                    for port in ports:
                        req["security_group_rule"].update({
                            "port_range_min": port,
                            "port_range_max": port
                        })
                        resp = net_client.post('/security-group-rules',
                                               json=req)
                        if resp.status_code != 201:
                            raise exceptions.ClientError(
                                'Failed to create security group rule on '
                                'cloud %(cloud_name)s with summary '
                                '%(ip)s %(port)s'
                                % {'cloud_name': cloud_name, 'ip': ip,
                                   'port': port})
                        print("Create new sg_rule, summary %(ip)s %(port)s" % {
                            "ip": ip,
                            "port": str(port)
                        })

            # remove unexpected sg_rules
            for cloud_name, ip_port_tuple_list in unexpect_rules.items():
                net_client = os_client_config.make_rest_client(
                    'network', cloud=cloud_name)
                print("Unexpect security group rules clean for cloud %s:" %
                      cloud_name)
                for ip_port_tuple in ip_port_tuple_list:
                    url = "/security-group-rules/%s" % ip_port_tuple[2]
                    resp = net_client.delete(url)
                    if resp.status_code != 204:
                        raise exceptions.ClientError(
                            'Failed to delete security group rule '
                            '%(rule_id)s on cloud %(cloud_name)s'
                            % {'cloud_name': cloud_name,
                               'rule_id': ip_port_tuple[2]})
                    print("Remove sg_rule %(rule_id)s, summary %(ip)s "
                          "%(port)s" % {
                        "rule_id": ip_port_tuple[2],
                        "ip": ip_port_tuple[0],
                        "port": str(ip_port_tuple[1])
                    })
        else:
            for cloud_name, ip_dict in expect_rules.items():
                if not ip_dict:
                    print("Cloud %s: PASSED" % cloud_name)
                    continue
                print("Found lack security group rules in cloud %s" %
                      cloud_name)
                for ip, ports in ip_dict.items():
                    print("    Need to create new rule for (ip)s (ports)s" % {
                        "ip": ip,
                        "ports": str(ports)
                    })

            # report unexpected sg_rules (dry run: nothing is deleted)
            for cloud_name, ip_port_tuple_list in unexpect_rules.items():
                print("Found unexpect security group rules clean for "
                      "cloud %s:" % cloud_name)
                for ip_port_tuple in ip_port_tuple_list:
                    print("    Need to remove sg_rule %(rule_id)s, "
                          "summary %(ip)s %(port)s" % {
                        "rule_id": ip_port_tuple[2],
                        "ip": ip_port_tuple[0],
                        "port": str(ip_port_tuple[1])
                    })

    def _init_ha_configuration(self):
        path = '/ha/configuration'
        self.client.create(path,
                           value=json.dumps(CONFIGURATION_DICT).encode('utf8'),
                           makepath=True)

    @_client_check_wrapper
    def list_configuration(self):
        path = '/ha/configuration'
        try:
            config_bytes = self.client.get(path)
        except kze.NoNodeError:
            self._init_ha_configuration()
            config_bytes = self.client.get(path)
        return json.loads(config_bytes[0].decode('utf8'))

    @_client_check_wrapper
    def update_configuration(self, name, value):
        path = '/ha/configuration'
        configs = self.list_configuration()
        if name not in configs.keys():
            raise exceptions.ClientError('There is not option %s' % name)
        configs[name] = value
        self.client.set(path, json.dumps(configs).encode('utf8'))
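
A detail worth lifting out of this example is _client_check_wrapper: a plain function
defined in the class body and applied as a decorator, so every public method fails fast
if connect() has not been called yet. A stripped-down sketch of the same guard (the
names here are illustrative, not from the source):

import functools

class ZkFacade:
    def __init__(self):
        self.client = None  # set by connect()

    def _require_client(func):
        # Guard: raise early if connect() has not initialised the client.
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if not self.client:
                raise RuntimeError("call connect() before using the client")
            return func(self, *args, **kwargs)
        return wrapper

    @_require_client
    def list_nodes(self):
        return self.client.get_children('/ha')
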
Esempio n. 35
0
class ZooKeeper(AbstractDCS):
    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'],
                                                       exhibitor['port'],
                                                       poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout') or 30),
                                  command_retry={
                                      'deadline': (config.get('reconnect_timeout') or 10),
                                      'max_delay': 1,
                                      'max_tries': -1
                                  },
                                  connection_retry={
                                      'max_delay': 1,
                                      'max_tries': -1
                                  })
        self.client.add_listener(self.session_listener)
        self.cluster_event = self.client.handler.event_object()

        self.fetch_cluster = True
        self.members = []
        self.leader = None
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, name, watch=None):
        try:
            return self.client.get(self.client_path(name), watch)
        except NoNodeError:
            pass
        except:
            logger.exception('get_node')
        return None

    @staticmethod
    def member(name, value, znode):
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.mzxid, name, conn_url, api_url, None, None)

    def load_members(self):
        members = []
        for member in self.client.get_children(self.client_path('/members'),
                                               self.cluster_watcher):
            data = self.get_node('/members/' + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.cluster_event.clear()
        leader = self.get_node('/leader', self.cluster_watcher)
        self.members = self.load_members()
        if leader:
            if leader[0] == self._name:
                client_id = self.client.client_id
                if client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                    logger.info('I am leader but not owner of the session. Removing leader node')
                    self.client.delete(self.client_path('/leader'))
                    leader = None

            if leader:
                for member in self.members:
                    if member.name == leader[0]:
                        leader = member
                        self.fetch_cluster = False
                        break
            if not isinstance(leader, Member):
                leader = Member(-1, leader, None, None, None, None)
        self.leader = leader
        if self.fetch_cluster:
            last_leader_operation = self.get_node('/optime/leader')
            if last_leader_operation:
                self.last_leader_operation = int(last_leader_operation[0])

    def get_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        return Cluster(True, self.leader, self.last_leader_operation,
                       self.members)

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, self.client_path(path),
                              value, **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create('/leader',
                           self._name,
                           makepath=True,
                           ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def race(self, path):
        return self._create(path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        for m in self.members:
            if m.name == self._name:
                return True
        path = self.client_path('/members/' + self._name)
        try:
            self.client.retry(self.client.create,
                              path,
                              connection_string,
                              makepath=True,
                              ephemeral=True)
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create,
                                  path,
                                  connection_string,
                                  makepath=True,
                                  ephemeral=True)
                return True
            except:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        last_operation = state_handler.last_operation()
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.client_path('/optime/leader')
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create,
                                      path,
                                      last_operation,
                                      makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        if isinstance(self.leader, Member) and self.leader.name == self._name:
            self.client.delete(self.client_path('/leader'))

    def sleep(self, timeout):
        self.cluster_event.wait(timeout)
        if self.cluster_event.isSet():
            self.fetch_cluster = True
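
When the Exhibitor poll reports a new ensemble, this example swaps the connection string
with client.set_hosts(...) instead of rebuilding the client. A minimal sketch of that
refresh pattern (fetch_hosts_somehow is a hypothetical stand-in for the Exhibitor poll):

from kazoo.client import KazooClient

def fetch_hosts_somehow():
    # Hypothetical: ask an ensemble provider (e.g. Exhibitor) for the
    # current comma-separated host list.
    return 'zk1:2181,zk2:2181,zk3:2181'

zk = KazooClient(hosts=fetch_hosts_somehow())
zk.start()

def refresh_ensemble():
    # Update the host list kazoo will use for future (re)connections.
    zk.set_hosts(fetch_hosts_somehow())
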
Esempio n. 36
0
class AnalyticsDiscovery(gevent.Greenlet):
    def _sandesh_connection_info_update(self, status, message):

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name,
                               status=new_conn_state,
                               server_addrs=self._zk_server.split(','),
                               message=message)

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            self._sandesh_connection_info_update(
                status='UP', message='Connection to Zookeeper re-established')
            self._logger.error("Analytics Discovery to publish %s" %
                               str(self._pubinfo))
            self._reconnect = True
        elif state == KazooState.LOST:
            self._logger.error("Analytics Discovery connection LOST")
            # Lost the session with ZooKeeper Server
            # The best option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Analytics Discovery connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._data_watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        self._logger.error("Analytics Discovery Watcher %s Children %s" %
                           (watcher, children))
        self._reconnect = True

    def __init__(self,
                 logger,
                 zkservers,
                 svc_name,
                 inst,
                 data_watchers={},
                 child_watchers={},
                 zpostfix="",
                 freq=10):
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(
            status='INIT', message='Connection to Zookeeper initialized')
        self._zkservers = zkservers
        self._zk = None
        self._pubinfo = None
        self._publock = Semaphore()
        self._data_watchers = data_watchers
        self._child_watchers = child_watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):

        # This function can be called concurrently by the main AlarmDiscovery
        # processing loop as well as by clients.
        # It is NOT re-entrant
        self._publock.acquire()

        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                self._logger.error("ensure %s" %
                                   (self._basepath + "/" + self._svc_name))
                self._logger.error("zk state %s (%s)" %
                                   (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(self._basepath + "/" + self._svc_name)
                self._logger.error("check for %s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._zk.set("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo)
                    else:
                        self._zk.create("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo, ephemeral=True)
                else:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(
                    status='DOWN',
                    message='Reconnect to Zookeeper to handle exception')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")
        self._publock.release()

    def _run(self):
        while True:
            self._logger.error("Analytics Discovery zk start")
            self._zk = KazooClient(hosts=self._zkservers, timeout=60.0)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._svc_name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(
                status='UP', message='Connection to Zookeeper established')
            self._reconnect = False
            # Done connecting to ZooKeeper

            for wk in self._data_watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))
            for wk in self._child_watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       self._child_watchers[wk])
            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._data_watchers[wk]:
                                self._data_watchers[wk](\
                                        sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._data_watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath +
                                                             "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(self._basepath + "/" + \
                                            wk + "/" + elem,
                                            partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(\
                                        self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._data_watchers[wk]:
                                self._data_watchers[wk](sorted(
                                    self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.remove_listener(self._zk_listen)
                    gevent.sleep(1)
                    try:
                        self._zk.stop()
                    except:
                        self._logger.error("Stopping kazooclient failed")
                    else:
                        self._logger.error("Stopping kazooclient successful")
                    try:
                        self._zk.close()
                    except:
                        self._logger.error("Closing kazooclient failed")
                    else:
                        self._logger.error("Closing kazooclient successful")
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
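
AnalyticsDiscovery drives kazoo's recipe watches (ChildrenWatch, DataWatch) alongside
manual get/get_children calls; unlike the one-shot watch parameter, the recipe form
re-registers itself automatically. A minimal sketch of the recipe form (host and base
path are placeholders):

from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
zk.ensure_path('/analytics-discovery-demo')  # placeholder base path

@zk.ChildrenWatch('/analytics-discovery-demo')
def on_children(children):
    # Called immediately, and again on every membership change.
    print('children:', children)

@zk.DataWatch('/analytics-discovery-demo')
def on_data(data, stat):
    # Called on every data change; stat is None if the node is deleted.
    print('data:', data, 'version:', stat and stat.version)
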
Esempio n. 37
0
        print("LOST")
    elif state == KazooState.SUSPENDED:
        # Handle being disconnected from Zookeeper
        print("SUSPENDED")
    else:
        print("HI")
        # Handle being connected/reconnected to Zookeeper


zk = KazooClient(hosts='zoo:2181')
zk.start()

zk.ensure_path("/worker")
zk.ensure_path("/master")

zk.add_listener(my_listener)

# children=[]
if (zk.exists("/worker")):
    children = zk.get_children("/worker")
    print(children)


@zk.DataWatch("/master")
def watch_node(data, stat, event):
    # pid=
    print("data c", data)
    children = zk.get_children("/worker")
    # print(children)

    print("Status >>>>>>>>>>>>>>>", stat)
Esempio n. 38
0
class ZookeeperWatcher(object):
    zoo_client = None  # The KazooClient to manage the config
    point_path = None  # Zookeeper path to pointed to file
    pointed_at_expired = None  # is True when the assignment has been set to
    # None but we cannot remove the config listener
    valid_handler = None  # the function to call when the validity changes
    config_handler = None  # the function to call when the config changes
    error_handler = None  # the function to call when an error occurs in reading
    valid_file = False  # the current state of the ConfigWatcher with ZK
    do_not_restart = False  # used when closing via ^C
    old_data = ''  # The current file contents, to see if a change occurred
    old_pointed = ''  # the current pointed path, to see if change occurred

    INVALID_PATH = "Invalid pointer path"
    INVALID_GET = "Invalid get on file path"
    BAD_CONNECTION = "Connection interrupted with Zookeeper, re-establishing"

    def __init__(self,
                 hosts,
                 filepath,
                 valid_handler=None,
                 config_handler=None,
                 error_handler=None,
                 pointer=False,
                 ensure=False,
                 valid_init=True):
        '''
        Zookeeper file watcher, used to tell a program their zookeeper file has
        changed. Can be used to watch a single file, or both a file and path of
        its contents. Manages all connections, drops, reconnections for you.

        @param hosts: The zookeeper hosts to use
        @param filepath: The full path to the file to watch
        @param valid_handler: The method to call for a 'is valid' state change
        @param config_handler: The method to call when a content change occurs
        @param error_handler: The method to call when an error occurs
        @param pointer: Set to true if the file contents are actually a path to
                        another zookeeper file, where the real config resides
        @param ensure: Set to true for the ZooWatcher to create the watched file
        @param valid_init: Ensure the client can connect to Zookeeper first try

        Ex 1. /stuff/A: "stuff I care about"
        Ex 2. /stuff/A: "/other/stuff", /other/stuff: "contents I care about"
            - in Ex 2 you care about /other/stuff contents
              but are only aware of your assignment /stuff/A

        You can use this class as any combination of event driven or polling.
        Polling:
            In the main loop of your program, check if is_valid() is
            True, otherwise clear your contents as there is some ZK error.
        Event:
            You will be notified via the various handlers when content changes.
        '''
        self.hosts = hosts
        self.my_file = filepath
        self.pointer = pointer
        self.ensure = ensure
        self.valid_handler = valid_handler
        self.config_handler = config_handler
        self.error_handler = error_handler

        if valid_init:
            # this will throw an exception if it can't start right away
            self.zoo_client = KazooClient(hosts=self.hosts)
            self.zoo_client.start()

        self.threaded_start(no_init=True)

    def threaded_start(self, no_init=False):
        '''
        Spawns a worker thread to set up the zookeeper connection
        '''
        thread = Thread(target=self.init_connections,
                        kwargs={'no_init': no_init})
        thread.daemon = True
        thread.start()
        thread.join()

    def init_connections(self, no_init=False):
        '''
        Sets up the initial Kazoo Client and watches
        '''
        success = False
        self.set_valid(False)

        if not no_init:
            if self.zoo_client:
                self.zoo_client.remove_listener(self.state_listener)
                self.old_data = ''
                self.old_pointed = ''

            while not success:
                try:
                    if self.zoo_client is None:
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                    else:
                        # self.zoo_client.stop()
                        self.zoo_client._connection.connection_stopped.set()
                        self.zoo_client.close()
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                except Exception as e:
                    log.error("ZKWatcher Exception: " + e.message)
                    sleep(1)
                    continue

                self.setup()
                success = self.update_file(self.my_file)
                sleep(5)
        else:
            self.setup()
            self.update_file(self.my_file)

    def setup(self):
        '''
        Ensures the path to the watched file exists and we have a state
        listener
        '''
        self.zoo_client.add_listener(self.state_listener)

        if self.ensure:
            self.zoo_client.ensure_path(self.my_file)

    def state_listener(self, state):
        '''
        Restarts the session if we get anything besides CONNECTED
        '''
        if state == KazooState.SUSPENDED:
            self.set_valid(False)
            self.call_error(self.BAD_CONNECTION)
        elif state == KazooState.LOST and not self.do_not_restart:
            self.threaded_start()
        elif state == KazooState.CONNECTED:
            # This is going to throw a SUSPENDED kazoo error
            # which will cause the sessions to be wiped and re established.
            # Used b/c of massive connection pool issues
            self.zoo_client.stop()

    def is_valid(self):
        '''
        @return: True if the currently watch file is valid
        '''
        return self.valid_file

    def ping(self):
        '''
        Simple command to test if the zookeeper session is able to connect
        at this very moment
        '''
        try:
            # dummy ping to ensure we are still connected
            self.zoo_client.server_version()
            return True
        except KazooException:
            return False

    def close(self, kill_restart=True):
        '''
        Use when you would like to close everything down
        @param kill_restart: Prevent the kazoo client from restarting
        '''
        self.do_not_restart = kill_restart
        self.zoo_client.stop()
        self.zoo_client.close()

    def get_file_contents(self, pointer=False):
        '''
        Gets any file contents you care about. Defaults to the main file
        @param pointer: Return the contents of the pointer file, not the
        pointed-at file
        @return: A string of the contents
        '''
        if self.pointer:
            if pointer:
                return self.old_pointed
            else:
                return self.old_data
        else:
            return self.old_data

    def watch_file(self, event):
        '''
        Fired when changes made to the file
        '''
        if not self.update_file(self.my_file):
            self.threaded_start()

    def update_file(self, path):
        '''
        Updates the file watcher and calls the appropriate method for results
        @return: False if we need to keep trying the connection
        '''
        try:
            # grab the file
            result, stat = self.zoo_client.get(path, watch=self.watch_file)
            result = result.decode('utf-8')
        except ZookeeperError:
            self.set_valid(False)
            self.call_error(self.INVALID_GET)
            return False

        if self.pointer:
            if result is not None and len(result) > 0:
                self.pointed_at_expired = False
                # file is a pointer, go update and watch other file
                self.point_path = result
                if self.compare_pointer(result):
                    self.update_pointed()
            else:
                self.pointed_at_expired = True
                self.old_pointed = ''
                self.old_data = ''
                self.set_valid(False)
                self.call_error(self.INVALID_PATH)
        else:
            # file is not a pointer, return contents
            if self.compare_data(result):
                self.call_config(result)
            self.set_valid(True)

        return True

    def watch_pointed(self, event):
        '''
        Fired when changes made to pointed file
        '''
        self.update_pointed()

    def update_pointed(self):
        '''
        Grabs the latest file contents based on the pointer uri
        '''
        # only grab file if our pointer is still good (not None)
        if not self.pointed_at_expired:
            try:
                conf_string, stat2 = self.zoo_client.get(
                    self.point_path, watch=self.watch_pointed)
                conf_string = conf_string.decode('utf-8')
            except ZookeeperError:
                self.old_data = ''
                self.set_valid(False)
                self.pointed_at_expired = True
                self.call_error(self.INVALID_PATH)
                return

            if self.compare_data(conf_string):
                self.call_config(conf_string)
            self.set_valid(True)

    def set_valid(self, boolean):
        '''
        Sets the state and calls the change if needed
        @param bool: The state (true or false)
        '''
        old_state = self.is_valid()
        self.valid_file = boolean

        if old_state != self.valid_file:
            self.call_valid(self.valid_file)

    def call_valid(self, state):
        '''
        Calls the valid change function passed in
        @param valid_state: The new config
        '''
        if self.valid_handler is not None:
            self.valid_handler(self.is_valid())

    def call_config(self, new_config):
        '''
        Calls the config function passed in
        @param new_config: The new config
        '''
        if self.config_handler is not None:
            self.config_handler(new_config)

    def call_error(self, message):
        '''
        Calls the error function passed in
        @param message: The message to throw
        '''
        if self.error_handler is not None:
            self.error_handler(message)

    def compare_data(self, data):
        '''
        Compares the string data
        @return: True if the data is different
        '''
        if self.old_data != data:
            self.old_data = data
            return True
        return False

    def compare_pointer(self, data):
        '''
        Compares the string data
        @return: True if the data is different
        '''
        if self.old_pointed != data:
            self.old_pointed = data
            return True
        return False
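A hedged usage sketch for ZookeeperWatcher, assuming a local ensemble; the handler names and the /stuff/A path are illustrative:

def on_config(new_config):
    # fired whenever the watched file's contents change
    print("config changed:", new_config)

def on_valid(is_valid):
    # fired whenever the watcher flips between valid and invalid
    print("watcher valid:", is_valid)

# hosts and filepath are illustrative values
watcher = ZookeeperWatcher(hosts='localhost:2181', filepath='/stuff/A',
                           config_handler=on_config, valid_handler=on_valid,
                           ensure=True)
# poll watcher.is_valid() in a main loop, or rely purely on the handlers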
Esempio n. 39
0
# Create a zookeeper listener
def _my_listener(state):
    if state == KazooState.LOST:
        # Register somewhere that the session was lost
        logger.warning("Zookeeper session lost: {}".format(state))
    elif state == KazooState.SUSPENDED:
        # Handle being disconnected from Zookeeper
        logger.warning("Zookeeper session suspended: {}".format(state))
    else:
        # Handle being connected/reconnected to Zookeeper
        logger.info("Connected to zookeeper: {}".format(state))


# Connect to Zookeeper
try:
    logger.info("Connecting to zookeeper")
    zk = KazooClient(hosts='localhost:2181')
    zk.add_listener(_my_listener)
    zk.start()
except Exception as e:
    logger.error("Unable to start the connection to Zookeeper".format(e))

# Create the base config in Zookeeper
try:
    zk.create("/zktesting")
    zk.create("/zktesting/traptor")
except Exception as e:
    logger.error("Unable to create the base Traptor config")

zk.stop()
Esempio n. 40
0
def connect(zk_quorum):
    logger.info('Connecting to zookeeper quorum at: {0}'.format(zk_quorum))
    zk = KazooClient(hosts=zk_quorum)
    zk.start()
    zk.add_listener(connection_lost)
    return zk
Esempio n. 41
0
class DeploymentConfig(object):
    """ Accesses deployment configuration options. """
    # The ZooKeeper node where configuration is stored.
    CONFIG_ROOT = '/appscale/config'

    def __init__(self, hosts):
        """ Creates new DeploymentConfig object.

    Args:
      hosts: A list of ZooKeeper hosts.
    """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.update_lock = Lock()
        self.state = ConfigStates.LOADING
        self.config = {}
        self.conn = KazooClient(hosts=hosts, read_only=True)
        self.conn.add_listener(self._conn_listener)
        self.conn.start()
        self.conn.ensure_path(self.CONFIG_ROOT)
        self.conn.ChildrenWatch(self.CONFIG_ROOT, func=self._update_config)

    def _conn_listener(self, state):
        """ Handles changes in ZooKeeper connection state.

    Args:
      state: A string indicating the new state.
    """
        if state == KazooState.LOST:
            self.logger.warning('ZK connection lost')
        elif state == KazooState.SUSPENDED:
            self.logger.warning('ZK connection suspended')
        else:
            self.logger.info('ZK connection established')

    def _load_child(self, child):
        """ Fetches the data for a configuration node.

    Args:
      child: A string containing the ZooKeeper node to fetch.
    Returns:
      A dictionary containing configuration data.
    Raises:
      InaccessibleConfig if ZooKeeper is not accessible.
    """
        node = '/'.join([self.CONFIG_ROOT, child])
        try:
            data, _ = self.conn.retry(self.conn.get, node)
        except NoNodeError:
            # NoNodeError subclasses ZookeeperError, so it must be caught first
            return {}
        except (KazooException, ZookeeperError):
            raise ConfigInaccessible('ZooKeeper connection not available')

        try:
            return json.loads(data)
        except ValueError:
            self.logger.warning('Invalid deployment config: {}'.format(child))
            return {}

    def _update_config(self, children):
        """ Updates configuration when it changes.

    Args:
      children: A list of ZooKeeper nodes.
    """
        with self.update_lock:
            self.state = ConfigStates.LOADING

            # Ensure old sections are removed.
            self.config = {}

            for child in children:
                while True:
                    try:
                        self.config[child] = self._load_child(child)
                        break
                    except ConfigInaccessible as load_error:
                        self.logger.warning(str(load_error))
                        time.sleep(SMALL_WAIT)

            self.logger.info('Deployment configuration updated')
            self.state = ConfigStates.LOADED

    def get_config(self, section):
        """ Fetches the configuration for a given section.

    Args:
      section: A string specifying the section to fetch.
    Returns:
      A dictionary containing configuration data.
    Raises:
      InaccessibleConfig if ZooKeeper is inaccessible.
    """
        # If the connection is established, it should finish loading very soon.
        while (self.state == ConfigStates.LOADING and self.conn.state
               not in (KazooState.LOST, KazooState.SUSPENDED)):
            time.sleep(TINY_WAIT)

        if self.state != ConfigStates.LOADED:
            raise ConfigInaccessible('ZooKeeper connection not available')

        with self.update_lock:
            if section not in self.config:
                return {}
            return self.config[section]

    def close(self):
        """ Close the ZooKeeper connection. """
        self.conn.stop()
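A minimal usage sketch for DeploymentConfig. The hosts value is passed straight to KazooClient, so a comma-separated host string works; the 'database' section name is an illustrative assumption:

config = DeploymentConfig(hosts='127.0.0.1:2181')  # illustrative ensemble
try:
    db_options = config.get_config('database')  # returns {} if the section is absent
except ConfigInaccessible:
    db_options = {}
config.close()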
Esempio n. 42
0
class ZookeeperProxy(object):

    hook_points = ['kazoo_state_change']

    def __init__(self):
        self._zk = None
        self.logger = None
        self._root_path = None
        self._hooks = Hooks(ZookeeperProxy.hook_points)

    def listener(self, state):
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            self.logger.info("listener, KazooState.LOST")
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            self.logger.info("listener, KazooState.SUSPENDED")
        elif state == KazooState.CONNECTED:
            # Handle being connected/reconnected to Zookeeper
            self.logger.info("listener, KazooState.CONNECTED")
        else:
            self.logger.info("listener, KazooState unknown")

        self.hooks.run('kazoo_state_change', state)

    def connect(self, ip_address, port, root_path, logger):
        if not self._zk:
            # establish zookeeper connection
            self._zk = KazooClient(
                hosts='{0}:{1}'.format(ip_address, port), logger=logger)
            self._zk.start()
            self._zk.add_listener(self.listener)

            # Ensure a path, create if necessary
            self._root_path = root_path
            self._zk.ensure_path(self._root_path)

            self.logger = logger
            # make sure finalize is called when stopping nio
            atexit.register(self.disconnect)

    def disconnect(self):
        self.logger.info("Disconnecting")
        if self._zk:
            self._zk.stop()
            self._zk = None

    def get_children(self, node_path):
        try:
            children = self._zk.get_children(node_path)
            return children
        except NoNodeError:
            pass  # pragma: no cover

        return None

    def fetch(self, node_path):
        try:
            data, stat = self._zk.get(node_path)
            if data:
                data = json.loads(data.decode())
        except NoNodeError:
            data = {}  # pragma: no cover
        return data

    def register(self, node_path, config):
        serialized_config = self._process_for_serialization(config)
        try:
            self._zk.create(node_path, serialized_config)
        except NodeExistsError:
            self._zk.set(node_path, serialized_config)

    def save(self, node_path, config):
        self._zk.set(node_path, self._process_for_serialization(config))

    def remove(self, node_path):
        self._zk.delete(node_path, recursive=True)

    @staticmethod
    def _process_for_serialization(config):
        data = {k: config[k] for k in config if not k.startswith('_')}
        return json.dumps(data).encode()

    def get_root_path(self):
        return self._root_path

    @property
    def hooks(self):
        return self._hooks
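A hedged sketch of driving ZookeeperProxy; the root path and config dict are invented for illustration. Note that keys starting with '_' are stripped before serialization:

import logging

proxy = ZookeeperProxy()
# ip/port, root path and config payload are illustrative
proxy.connect('127.0.0.1', 2181, '/nio/services', logging.getLogger(__name__))
proxy.register('/nio/services/block-1', {'state': 'started', '_tmp': 'dropped'})
print(proxy.fetch('/nio/services/block-1'))  # -> {'state': 'started'}
proxy.disconnect()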
Esempio n. 43
0
class ZkConfig:
    def __init__(self):
        self.zk = KazooClient(hosts=zk_hosts)
        try:
            self.zk.start()
        except KazooTimeoutError as e:
            exit(e.args)
        finally:
            self.zk.add_listener(connection_listener)

    def _get_config(func):
        def wrapper(self,
                    node_path_env: str,
                    node_path_conf: str,
                    conf_dict: dict = {}):
            stat = self.zk.exists(node_path_conf)
            if stat is None:
                err_msg = "%s is not exists".format(node_path_conf)
                logger.error(err_msg)
                raise DpException(
                    ErrorConstants.ec_sys_error,
                    ErrorConstants.error_code_message.get(
                        ErrorConstants.ec_sys_error) + err_msg)

            @self.zk.ChildrenWatch(node_path_env)
            def watch_children(children):
                logger.warning("Children of %s are now: %s\n", node_path_env,
                               children)

            @self.zk.DataWatch(node_path_conf)
            def watch_node(data, stat, event: WatchedEvent):
                logger.warning("Version: %s, data: %s, event is %s\n",
                               stat.version, data.decode("utf-8"), event)

            data = self.zk.get(node_path_conf)
            conf_data = data[0].decode("utf-8")
            try:
                conf_dict = json.loads(conf_data)
            except:
                err_msg = "can't convert conf data to dict, conf_data is %s".format(
                    conf_data)
                logger.error(err_msg)
                raise DpException(
                    ErrorConstants.ec_sys_error,
                    ErrorConstants.error_code_message.get(
                        ErrorConstants.ec_sys_error) + err_msg)

            func(self, node_path_conf, node_path_env, conf_dict)

        return wrapper

    @_get_config
    def get_db_config(self,
                      node_path_env: str,
                      node_path_conf: str,
                      conf_dict: dict = {}):
        test_dict = {}
        test_dict["host"] = "a"
        test_dict.get("host")
        try:
            db_config.host = conf_dict.get("host")
            db_config.user = conf_dict.get("user")
            db_config.passwd = conf_dict.get("passwd")
            db_config.db = conf_dict.get("db")
            db_config.port = conf_dict.get("port")
        except:
            err_msg = "param of mysql config is deficiency: %s".format(
                conf_dict)
            logger.error(err_msg)
            raise DpException(
                ErrorConstants.ec_sys_error,
                ErrorConstants.error_code_message.get(
                    ErrorConstants.ec_sys_error) + err_msg)
        else:
            logger.debug(db_config)
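A usage sketch for ZkConfig, assuming the module-level zk_hosts, logger and db_config objects it references are defined elsewhere; both znode paths below are illustrative:

cfg = ZkConfig()
# the decorator fetches and JSON-decodes the config node, installs watches,
# then hands the parsed dict to get_db_config, which fills db_config
cfg.get_db_config('/dp/env/dev', '/dp/env/dev/db')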
Esempio n. 44
0
class PartitionClient(object):
    """ Client Class for the Partition Library
    Example usage:
    ---------------------
    import libpartition
    from libpartition.libpartition import PartitionClient

    def own_change_cb(l):
            print("ownership change:" + str(l))

    c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32,
            own_change_cb, "zookeeper_s1")

    ## do some real work now
    if (c.own_partition(1)):
        ...... do something with partition #1 .....
        .........
    ...
    c.update_cluster_list(["s1", "s2"])
    ...
    ----------------------
    You should not call any partition library routine from within the 
    callback function

    Args:
        app_name(str): Name of the app for which partition cluster is used
        self_name(str): Name of the local cluster node (can be ip address)
        cluster_list(list): List of all the nodes in the cluster including 
            local node
        max_partition(int): Partition space always go from 0..max_partition-1
        partition_update_cb: Callback function invoked when the partition
            ownership list is updated.
        zk_server(str): <zookeeper server>:<zookeeper server port>
    """
    def __init__(self,
                 app_name,
                 self_name,
                 cluster_list,
                 max_partition,
                 partition_update_cb,
                 zk_server,
                 logger=None):

        # Initialize local variables
        self._zk_server = zk_server
        self._cluster_list = set(cluster_list)
        self._max_partition = max_partition
        self._update_cb = partition_update_cb
        self._curr_part_ownership_list = []
        self._target_part_ownership_list = []
        self._con_hash = ConsistentHash(cluster_list)
        self._name = self_name

        # some sanity check
        if not (self._name in cluster_list):
            raise ValueError('cluster list is missing local server name')

        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        # connect to zookeeper
        while True:
            self._logger.error("Libpartition zk start")
            self._zk = KazooClient(zk_server)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in Libpartition zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        # create a lock array to contain locks for each partition
        self._part_locks = []
        for part in range(0, self._max_partition):
            lockpath = "/lockpath/" + app_name + "/" + str(part)
            l = self._zk.Lock(lockpath, self._name)
            self._part_locks.append(l)

        # initialize partition # to lock acquire greenlet dictionary
        self._part_lock_task_dict = {}

        self._logger.error("initial servers:" + str(self._cluster_list))

        # update target partition ownership list
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end __init__

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper',
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Libpartition listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Libpartition connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Libpartition connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    # following routine is the greenlet task function to acquire the lock
    # for a partition
    def _acquire_lock(self, part):
        # lock for the partition
        l = self._part_locks[part]

        # go in an infinite loop waiting to acquire the lock
        try:
            while True:
                ret = l.acquire(blocking=False)
                if ret == True:
                    self._logger.error("Acquired lock for:" + str(part))
                    self._curr_part_ownership_list.append(part)
                    self._update_cb(self._curr_part_ownership_list)
                    return True
                else:
                    gevent.sleep(1)
        except CancelledError:
            self._logger.error("Lock acquire cancelled for:" + str(part))
            return False
        except Exception as ex:
            # TODO: If we have a non-KazooException, the lock object
            #       may get stuck in the "cancelled" state
            self._logger.error("Lock acquire unexpected error!: " + str(ex))
            # This exception should get propagated to the main thread
            raise SystemExit(1)

    #end _acquire_lock

    # get rid of finished spawned tasks from datastructures
    def _cleanup_greenlets(self):
        # iterate over a copy of the keys; entries are deleted while iterating
        for part in list(self._part_lock_task_dict.keys()):
            if (self._part_lock_task_dict[part].ready()):
                del self._part_lock_task_dict[part]

    #end _cleanup_greenlets

    # following routine launches tasks to acquire partition locks
    def _acquire_partition_ownership(self):
        # cleanup any finished greenlets
        self._cleanup_greenlets()

        # this variable will help us decide if we need to call callback
        updated_curr_ownership = False

        # list of partitions for which locks have to be released
        release_lock_list = []

        self._logger.info("known servers: %s" % self._con_hash.get_all_nodes())

        for part in range(0, self._max_partition):
            if (part in self._target_part_ownership_list):
                if (part in self._curr_part_ownership_list):
                    # do nothing, I already have ownership of this partition
                    self._logger.info("No need to acquire ownership of:" +
                                      str(part))
                else:
                    # I need to acquire lock for this partition before I own
                    if (part in self._part_lock_task_dict.keys()):
                        try:
                            self._part_lock_task_dict[part].get(block=False)
                        except:
                            # do nothing there is already a greenlet running to
                            # acquire the lock
                            self._logger.error("Already a greenlet running to"
                                               " acquire:" + str(part))
                            continue

                        # Greenlet died without getting ownership. Cleanup
                        self._logger.error("Cleanup stale greenlet running to"
                                           " acquire:" + str(part))
                        del self._part_lock_task_dict[part]

                    self._logger.error("Starting greenlet running to"
                                       " acquire:" + str(part))
                    # launch the greenlet to acquire the lock
                    g = Greenlet.spawn(self._acquire_lock, part)
                    self._part_lock_task_dict[part] = g

            else:
                # give up ownership of the partition

                # cancel any lock acquisition which is ongoing
                if (part in self._part_lock_task_dict.keys()):
                    try:
                        self._part_lock_task_dict[part].get(block=False)
                    except:

                        self._logger.error(
                            "canceling lock acquisition going on \
                            for:" + str(part))
                        # Cancelling the lock should result in killing the gevent
                        self._part_locks[part].cancel()
                        self._part_lock_task_dict[part].get(block=True)

                    del self._part_lock_task_dict[part]

                if (part in self._curr_part_ownership_list):
                    release_lock_list.append(part)
                    self._curr_part_ownership_list.remove(part)
                    updated_curr_ownership = True
                    self._logger.error("giving up ownership of:" + str(part))

        if (updated_curr_ownership is True):
            # current partition membership was updated call the callback
            self._update_cb(self._curr_part_ownership_list)

        if (len(release_lock_list) != 0):
            # release locks which were acquired
            for part in release_lock_list:
                self._logger.error("release the lock which was acquired:" + \
                        str(part))
                try:
                    self._part_locks[part].release()
                    self._logger.error("fully gave up ownership of:" +
                                       str(part))
                except:
                    pass

    #end _acquire_partition_ownership

    def update_cluster_list(self, cluster_list):
        """ Updates the cluster node list
        Args:
            cluster_list(list): New list of names of the nodes in 
                the cluster
        Returns:
            None
        """
        # some sanity check
        if not (self._name in cluster_list):
            raise ValueError('cluster list is missing local server name')

        new_cluster_list = set(cluster_list)
        new_servers = list(new_cluster_list.difference(self._cluster_list))
        deleted_servers = list(
            set(self._cluster_list).difference(new_cluster_list))
        self._cluster_list = set(cluster_list)

        # update the hash structure
        if new_servers:
            self._logger.error("new servers:" + str(new_servers))
            self._con_hash.add_nodes(new_servers)
        if deleted_servers:
            self._logger.error("deleted servers:" + str(deleted_servers))
            self._con_hash.del_nodes(deleted_servers)

        # update target partition ownership list
        self._target_part_ownership_list = []
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                if not (part in self._target_part_ownership_list):
                    self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end update_cluster_list

    def own_partition(self, part_no):
        """ Returns ownership information of a partition
        Args:
            part_no(int) : Partition no 
        Returns:
            True if partition is owned by the local node
            False if partition is not owned by the local node
        """
        return part_no in self._curr_part_ownership_list

    #end own_partition

    def close(self):
        """ Closes any connections and frees up any data structures
        Args:
        Returns:
            None
        """
        # clean up greenlets
        for part in self._part_lock_task_dict.keys():
            try:
                self._logger.error("libpartition greenlet cleanup %s" %
                                   str(part))
                self._part_lock_task_dict[part].kill()
            except:
                pass

        self._zk.remove_listener(self._zk_listen)
        gevent.sleep(1)
        self._logger.error("Stopping libpartition")
        # close zookeeper
        try:
            self._zk.stop()
        except:
            self._logger.error("Stopping libpartition failed")
        else:
            self._logger.error("Stopping libpartition successful")

        self._logger.error("Closing libpartition")
        try:
            self._zk.close()
        except:
            self._logger.error("Closing libpartition failed")
        else:
            self._logger.error("Closing libpartition successful")
Esempio n. 45
0
PUT_SUCCESS = 0
PUT_ERROR = -1
PUT_PROP_SUCCESS = 0
PUT_PROP_ERROR = -1
DELETE_SUCCESS = 0
DELETE_ERROR = -1
DELETE_PROP_SUCCESS = 0
DELETE_PROP_ERROR = -1
DUMP_SUCCESS = 0
DUMP_ERROR = -1

zk_host = "127.0.0.1"
zk_port = 2181
zk = KazooClient(hosts=(zk_host + ":" + str(zk_port)))
zk.start()
zk.add_listener(zk_state_listener)

host = ""
port = -1
GroupId = -1
ServerId = -1
peer_infos = []
group_infos = {}
model = None
hash_table = None


class serverRPC:
    def get(self, key):
        try:
            return model.get(key)
Esempio n. 46
0
File: zk.py Project: zhucc/zuul
class ZooKeeper(object):
    '''
    Class implementing the ZooKeeper interface.

    This class uses the facade design pattern to keep common interaction
    with the ZooKeeper API simple and consistent for the caller, and
    limits coupling between objects. It allows for more complex interactions
    by providing direct access to the client connection when needed (though
    that is discouraged). It also provides for a convenient entry point for
    testing only ZooKeeper interactions.
    '''

    log = logging.getLogger("zuul.zk.ZooKeeper")

    REQUEST_ROOT = '/nodepool/requests'
    NODE_ROOT = '/nodepool/nodes'

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self):
        '''
        Initialize the ZooKeeper object.
        '''
        self.client = None
        self._became_lost = False
        self._last_retry_log = 0

    def _dictToStr(self, data):
        return json.dumps(data).encode('utf8')

    def _strToDict(self, data):
        return json.loads(data.decode('utf8'))

    def _connection_listener(self, state):
        '''
        Listener method for Kazoo connection state changes.

        .. warning:: This method must not block.
        '''
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
            self._became_lost = True
        elif state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
        else:
            self.log.debug("ZooKeeper connection: CONNECTED")

    @property
    def connected(self):
        return self.client.state == KazooState.CONNECTED

    @property
    def suspended(self):
        return self.client.state == KazooState.SUSPENDED

    @property
    def lost(self):
        return self.client.state == KazooState.LOST

    @property
    def didLoseConnection(self):
        return self._became_lost

    def resetLostFlag(self):
        self._became_lost = False

    def logConnectionRetryEvent(self):
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    def connect(self, hosts, read_only=False, timeout=10.0):
        '''
        Establish a connection with ZooKeeper cluster.

        Convenience method if a pre-existing ZooKeeper connection is not
        supplied to the ZooKeeper object at instantiation time.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        :param bool read_only: If True, establishes a read-only connection.
        :param float timeout: The ZooKeeper session timeout, in
            seconds (default: 10.0).
        '''
        if self.client is None:
            self.client = KazooClient(hosts=hosts,
                                      read_only=read_only,
                                      timeout=timeout)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            while True:
                try:
                    self.client.start(1)
                    break
                except KazooTimeoutError:
                    self.logConnectionRetryEvent()

    def disconnect(self):
        '''
        Close the ZooKeeper cluster connection.

        You should call this method if you used connect() to establish a
        cluster connection.
        '''
        if self.client is not None and self.client.connected:
            self.client.stop()
            self.client.close()
            self.client = None

    def resetHosts(self, hosts):
        '''
        Reset the ZooKeeper cluster connection host list.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        '''
        if self.client is not None:
            self.client.set_hosts(hosts=hosts)

    def submitNodeRequest(self, node_request, watcher):
        '''
        Submit a request for nodes to Nodepool.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.

        :param callable watcher: A callable object that will be
            invoked each time the request is updated.  It is called
            with two arguments: (node_request, deleted) where
            node_request is the same argument passed to this method,
            and deleted is a boolean which is True if the node no
            longer exists (notably, this will happen on disconnection
            from ZooKeeper).  The watcher should return False when
            further updates are no longer necessary.
        '''
        data = node_request.toDict()
        data['created_time'] = time.time()

        path = '%s/%s-' % (self.REQUEST_ROOT, node_request.priority)
        path = self.client.create(path,
                                  self._dictToStr(data),
                                  makepath=True,
                                  sequence=True,
                                  ephemeral=True)
        reqid = path.split("/")[-1]
        node_request.id = reqid

        def callback(data, stat):
            if data:
                data = self._strToDict(data)
                request_nodes = list(node_request.nodeset.getNodes())
                for i, nodeid in enumerate(data.get('nodes', [])):
                    node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
                    node_data, node_stat = self.client.get(node_path)
                    node_data = self._strToDict(node_data)
                    request_nodes[i].id = nodeid
                    request_nodes[i].updateFromDict(node_data)
                node_request.updateFromDict(data)
            deleted = (data is None)  # data *are* none
            return watcher(node_request, deleted)

        self.client.DataWatch(path, callback)

    def deleteNodeRequest(self, node_request):
        '''
        Delete a request for nodes.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.
        '''

        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        try:
            self.client.delete(path)
        except kze.NoNodeError:
            pass

    def nodeRequestExists(self, node_request):
        '''
        See if a NodeRequest exists in ZooKeeper.

        :param NodeRequest node_request: A NodeRequest to verify.

        :returns: True if the request exists, False otherwise.
        '''
        path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        if self.client.exists(path):
            return True
        return False

    def storeNode(self, node):
        '''Store the node.

        The node is expected to already exist and is updated in its
        entirety.

        :param Node node: The node to update.
        '''

        path = '%s/%s' % (self.NODE_ROOT, node.id)
        self.client.set(path, self._dictToStr(node.toDict()))

    def lockNode(self, node, blocking=True, timeout=None):
        '''
        Lock a node.

        This should be called as soon as a request is fulfilled and
        the lock held for as long as the node is in-use.  It can be
        used by nodepool to detect if Zuul has gone offline and the
        node should be reclaimed.

        :param Node node: The node which should be locked.
        '''

        lock_path = '%s/%s/lock' % (self.NODE_ROOT, node.id)
        try:
            lock = Lock(self.client, lock_path)
            have_lock = lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException("Timeout trying to acquire lock %s" %
                                lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not have_lock:
            raise LockException("Did not get lock on %s" % lock_path)

        node.lock = lock

    def unlockNode(self, node):
        '''
        Unlock a node.

        The node must already have been locked.

        :param Node node: The node which should be unlocked.
        '''

        if node.lock is None:
            raise LockException("Node %s does not hold a lock" % (node, ))
        node.lock.release()
        node.lock = None

    def heldNodeCount(self, autohold_key):
        '''
        Count the number of nodes being held for the given tenant/project/job.

        :param set autohold_key: A set with the tenant/project/job names.
        '''
        identifier = " ".join(autohold_key)
        try:
            nodes = self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return 0

        count = 0
        for nodeid in nodes:
            node_path = '%s/%s' % (self.NODE_ROOT, nodeid)
            node_data, node_stat = self.client.get(node_path)
            if not node_data:
                self.log.warning("Node ID %s has no data", nodeid)
                continue
            node_data = self._strToDict(node_data)
            if (node_data['state'] == zuul.model.STATE_HOLD
                    and node_data.get('hold_job') == identifier):
                count += 1
        return count
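A hedged connection sketch for the facade above; the ensemble addresses are illustrative, and submitNodeRequest additionally needs a zuul NodeRequest model object:

zk = ZooKeeper()
zk.connect('127.0.0.1:2181', timeout=10.0)  # illustrative host list
print("connected:", zk.connected)
zk.resetHosts('127.0.0.1:2181,127.0.0.2:2181')  # update hosts on a live client
zk.disconnect()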
Esempio n. 47
0
class ZookeeperServiceRegistry(BaseServiceRegistry):
    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        return cls(hosts=config.get('hosts', DEFAULT_HOSTS),
                   chroot=config.get('chroot', DEFAULT_CHROOT),
                   **kwargs)

    def on_start(self, timeout=10):
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        try:
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(path,
                                       watch=functools.partial(
                                           self.on_service_watch, service))
        value, znode = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        result = self.client.get_children_async(path='%s/services' %
                                                self.chroot, )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        def child_watch(event):
            print(event)

        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(None,
                                "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)
        identities = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            try:
                identities.remove(identity)
            except KeyError:
                pass
        for identity in identities:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(path,
                                          value.encode('utf-8'),
                                          ephemeral=True,
                                          makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        # FIXME: result.set_exception(RegistrationFailure())  # would make get() always raise
        result.get(timeout=timeout)
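A minimal start/stop sketch, assuming a gevent-friendly environment (the class uses SequentialGeventHandler) and a reachable local ensemble:

registry = ZookeeperServiceRegistry(hosts='127.0.0.1:2181', chroot='/lymph')
registry.on_start(timeout=10)  # async start plus wait; raises if unreachable
print(registry.discover(container=None))  # service types under <chroot>/services
registry.on_stop()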
Esempio n. 48
0
class ZookeeperRegistry(Registry):
    _app_config = ApplicationConfig('default_app')
    _connect_state = 'UNCONNECT'

    def __init__(self, zk_hosts, application_config=None):
        if application_config:
            self._app_config = application_config
        self.__zk = KazooClient(hosts=zk_hosts)
        self.__zk.add_listener(self.__state_listener)
        self.__zk.start()

    def __state_listener(self, state):
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            self._connect_state = state
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            # print 'disconnect from zookeeper'
            self._connect_state = state
        else:
            # Handle being connected/reconnected to Zookeeper
            # print 'connected'
            self._connect_state = state

    def __unquote(self, origin_nodes):
        return (urllib.parse.unquote(child_node) for child_node in origin_nodes
                if child_node)  #decode('utf8')

    def _do_event(self, event):
        # event.path looks like /dubbo/com.ofpay.demo.api.UserProvider/providers
        # to recover the service name, strip the leading /dubbo/ and the trailing /providers
        # collect the service nodes found in zookeeper into a dict
        # the node urls kept in zookeeper look like the following
        provide_name = event.path[7:event.path.rfind('/')]
        if event.state == 'CONNECTED':
            children = self.__zk.get_children(event.path,
                                              watch=self.event_listener)
            self._compare_swap_nodes(provide_name, self.__unquote(children))
        if event.state == 'DELETED':
            children = self.__zk.get_children(event.path,
                                              watch=self.event_listener)
            self._compare_swap_nodes(provide_name, self.__unquote(children))

    def register(self, interface, **kwargs):
        ip = self.__zk._connection._socket.getsockname()[0]
        params = {
            'interface': interface,
            'application': self._app_config.name,
            'application.version': self._app_config.version,
            'category': 'consumer',
            'dubbo': 'dubbo-client-py-1.0.0',
            'environment': self._app_config.environment,
            'method': '',
            'owner': self._app_config.owner,
            'side': 'consumer',
            'pid': os.getpid(),
            'version': '1.0'
        }
        url = 'consumer://{0}/{1}?{2}'.format(ip, interface,
                                              urllib.parse.urlencode(params))
        # print(urllib.parse.quote(url, safe=''))

        consumer_path = '{0}/{1}/{2}'.format('dubbo', interface, 'consumers')
        self.__zk.ensure_path(consumer_path)

        if not self.__zk.exists(consumer_path + '/' +
                                urllib.parse.quote(url, safe='')):
            self.__zk.create(consumer_path + '/' +
                             urllib.parse.quote(url, safe=''),
                             ephemeral=True)

    def subscribe(self, interface, **kwargs):
        """
        Watch the registry for providers of this service going online/offline
        :param interface: a service name such as com.ofpay.demo.api.UserProvider
        :return: None
        """
        version = kwargs.get('version', '')
        group = kwargs.get('group', '')
        children = self.__zk.get_children('{0}/{1}/{2}'.format(
            'dubbo', interface, 'providers'),
                                          watch=self.event_listener)
        # re-add everything from scratch
        self._compare_swap_nodes(interface, self.__unquote(children))
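A hedged sketch of registering a consumer and subscribing to provider changes. It assumes the Registry base class (not shown) supplies event_listener and _compare_swap_nodes:

registry = ZookeeperRegistry('127.0.0.1:2181')  # illustrative quorum
registry.register('com.ofpay.demo.api.UserProvider')   # ephemeral consumer znode
registry.subscribe('com.ofpay.demo.api.UserProvider')  # watch providers up/down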
Esempio n. 49
0
class BaseZKgRPC():

    ZK_ENDPOINT = '127.0.0.1:2181'
    CA_FILE = False
    CLIENT_CERT = False
    CLIENT_KEY = False

    def __init__(self):
        """Constructor"""

        # immediately connect to zookeeper
        self.zk = KazooClient(hosts=self.ZK_ENDPOINT, read_only=True)
        self.zk.start()
        self.zk_connected = True

        # add state change listener to monitor zk connection events
        self.zk.add_listener(self.kazoo_listener)

        # register deconstructor to run on exit
        atexit.register(self.__del__)

        self.channel = False
        self.stub = False

        # determine stub class and import
        stub_package, stub_class = self.STUB_CLASS.rsplit('.', 1)
        self.stub_class = getattr(importlib.import_module(stub_package),
                                  stub_class)

        # do the same for all methods and generate instance methods
        for method_name, request_class_package in self.METHODS:
            request_package, request_class_name = request_class_package.rsplit(
                '.', 1)
            request_class = getattr(importlib.import_module(request_package),
                                    request_class_name)
            # create method to call lambda which passes info to generic method
            setattr(
                self,
                method_name,
                lambda method_name=method_name, request_class=request_class, **
                v: self.call_method(method_name, request_class, **v))

    def kazoo_listener(self, state):
        """Zookeeper state change handler monitors connection and flags availability"""

        if state == KazooState.LOST:
            LOGGER.info("Kazoo session lost")
            self.zk_connected = False
        elif state == KazooState.SUSPENDED:
            LOGGER.info("Kazoo disconnected")
            self.zk_connected = False
        else:
            LOGGER.info("Kazoo connected")
            self.zk_connected = True

    def connect(self):
        """Connect to gRPC endpoint based on TLS config"""

        endpoint = self.get_endpoint()
        LOGGER.info('Using endpoint: {}'.format(endpoint))

        # make TLS connection if given a root certificate
        if self.CA_FILE:
            self.channel = grpc.secure_channel(self.get_endpoint(),
                                               self.get_credentials())
        else:
            self.channel = grpc.insecure_channel(self.get_endpoint())
        self.stub = self.stub_class(self.channel)

    def get_endpoint(self):
        """Queries Zookeeper for a random available gRPC endpoint"""

        hosts = []
        # loop until at least one host is returned
        while len(hosts) < 1:
            try:
                # wait until zookeeper is flagged as available
                while not self.zk_connected:
                    LOGGER.info("Waiting for Zookeeper connection")
                    time.sleep(1)

                # iterate keys within the root
                brokers = [
                    json.loads(
                        self.zk.get('{}/{}'.format(self.ZK_KEY, node))[0])
                    for node in self.zk.get_children(self.ZK_KEY)
                ]

                # build endpoints from returned json
                hosts = ['%s:%d' % (b['host'], b['port']) for b in brokers]

                # wait until at least one host is returned
                if len(hosts) == 0:
                    LOGGER.info(
                        "Waiting for hosts to be available in {}".format(
                            self.ZK_KEY))
                    time.sleep(1)
            # handle connection issues and try again
            except NoNodeError:
                time.sleep(1)
            except ConnectionLoss:
                time.sleep(1)

        # choose a random host
        return random.choice(hosts)

    def get_credentials(self):
        """Creates TLS credentials"""

        with open(self.CA_FILE, 'rb') as f:
            ca_trust = f.read()
        return grpc.ssl_channel_credentials(root_certificates=ca_trust)

    def call_method(self, method_name, request_class, *args, **kwargs):
        """Generic method to build then send gRPC request"""

        # ensure we're connected
        if not self.channel or not self.stub:
            self.connect()

        # create a valid request message
        request = request_class(**kwargs)

        # reconnect automatically
        while True:
            try:
                # make the call
                response = getattr(self.stub, method_name)(request)
                return response
            except grpc.RpcError as ex:
                # reconnect on status codes that indicate a recoverable failure
                if ex.code() in (grpc.StatusCode.UNAVAILABLE,
                                 grpc.StatusCode.INTERNAL):
                    LOGGER.info('Reconnecting...')
                    self.connect()
                else:
                    raise

    def __del__(self):
        """Ends zookeeper session and closes connection"""

        self.zk.stop()
        self.zk.close()
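
The base class above only becomes usable once a subclass supplies STUB_CLASS, METHODS and ZK_KEY. A minimal sketch of such a subclass; the service name, module paths and request field below are hypothetical, not part of the original code:

class EchoServiceClient(BaseZKgRPC):
    ZK_ENDPOINT = '127.0.0.1:2181'
    ZK_KEY = '/services/echo'               # hypothetical znode whose children hold {"host": ..., "port": ...}
    STUB_CLASS = 'echo_pb2_grpc.echoStub'   # dotted path to a generated stub class
    METHODS = [
        ('echo', 'echo_pb2.EchoRequest'),   # exposed as self.echo(**request_fields)
    ]

# client = EchoServiceClient()
# reply = client.echo(message='hello')     # builds EchoRequest(message='hello') and calls stub.echo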
Example n. 50
0
    while not success:
        try:
            redis_connection.slaveof(host=tHost, port=tPort)
            success = True
        except redis.ConnectionError:
            if timeout <= 0:
                raise OSError("Timeout reached. Couldn't connect to Redis.")
            print("Can't connect to Redis. Sleeping for %d seconds..." % retry_time)
            sys.stdout.flush()
            timeout -= retry_time
            sleep(retry_time)


def touch(fname, times=None):
    with open(fname, 'a'):
        os.utime(fname, times)


signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

print "Redis %s %s" % (redis_host, redis_port)
sys.stdout.flush()
redis_connection = redis.StrictRedis(host=redis_host, port=redis_port, db=0)

zk_connection = KazooClient(hosts=zk_hosts)
zk_connection.add_listener(zookeeper_listener)
zk_connection.start()

start_election_and_take_position()
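
The excerpt assumes zookeeper_listener, signal_handler and start_election_and_take_position are defined earlier in the script; a plausible shape for the first two (bodies are illustrative assumptions, and the KazooState import is assumed):

def zookeeper_listener(state):
    # kazoo invokes this on session state changes
    if state == KazooState.LOST:
        print("ZooKeeper session lost")
    elif state == KazooState.SUSPENDED:
        print("ZooKeeper connection suspended")
    else:
        print("ZooKeeper (re)connected")
    sys.stdout.flush()

def signal_handler(signum, frame):
    # stop cleanly so the ephemeral election znode disappears promptly
    zk_connection.stop()
    sys.exit(0)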
Example n. 51
0
class AnalyticsDiscovery(gevent.Greenlet):

    def _sandesh_connection_info_update(self, status, message):

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type = ConnectionType.ZOOKEEPER,
                name = self._svc_name, status = new_conn_state,
                message = message,
                server_addrs = self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' %(message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            if self._conn_state != ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='UP', message='')
                self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
                self._reconnect = True
            else:
                self._logger.error("Analytics Discovery already connected")
        else:
            self._logger.error("Analytics Discovery NOT connected")
            if self._conn_state == ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='DOWN', message='')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst,
                watchers={}, zpostfix="", freq=10):
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zk = KazooClient(hosts=zkservers)
        self._pubinfo = None
        self._watchers = watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                self._logger.error("ensure %s" % (self._basepath + "/" + self._svc_name))
                self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(self._basepath + "/" + self._svc_name)
                self._logger.error("check for %s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._zk.set("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo)
                    else:
                        self._zk.create("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo, ephemeral=True)
                else:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")

    def _run(self):
        while True:
            try:
                self._zk.start()
                break
            except gevent.event.Timeout as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
            # Zookeeper can also raise exceptions due to delays in master election
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            self._zk.add_listener(self._zk_listen)
            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                        partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](\
                                        sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath + "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                 del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(self._basepath + "/" + \
                                            wk + "/" + elem,
                                            partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(\
                                        self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.stop()
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
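
A hedged usage sketch: AnalyticsDiscovery is a greenlet, so it is started rather than run inline; the service name, instance id and payload below are placeholders, and the Contrail sandesh modules referenced by _sandesh_connection_info_update must be importable for this to run:

def peers_changed(child_data_list):
    # called with the sorted data dicts of the watched children
    print("collector peers now: %s" % child_data_list)

ad = AnalyticsDiscovery(logger=None,
                        zkservers='127.0.0.1:2181',
                        svc_name='collector',
                        inst='collector-0',
                        watchers={'collector': peers_changed},
                        zpostfix='demo')
ad.start()                                    # spawns _run()
ad.publish(json.dumps({'ip-address': '10.0.0.1'}))  # ephemeral znode under /analytics-discovery-demo/collector/collector-0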
Example n. 52
0
class _ZookeeperProxy(object):
    def __init__(self, address_provider: AddressListProvider, prefix: str):
        self.address_provider = address_provider
        self.async_counter = WaitingCounter(limit=100)
        self.conn_str = None
        self.client = None
        self.prefix = prefix
        self.hosts_cache = SlowlyUpdatedCache(
            self.address_provider.get_latest_address,
            self._update_hosts,
            30,  # Refresh every 30 seconds
            3 * 60)  # Update only after 180 seconds of stability

    def _update_hosts(self, value):
        hosts, port = value
        if hosts:
            self.conn_str = ','.join(['{}:{}'.format(h, port)
                                      for h in hosts]) + self.prefix
            if self.client is None:
                self.client = KazooClient(hosts=self.conn_str,
                                          command_retry={
                                              'deadline': 10,
                                              'max_delay': 1,
                                              'max_tries': -1
                                          },
                                          connection_retry={
                                              'max_delay': 1,
                                              'max_tries': -1
                                          })
                self.client.add_listener(self.session_listener)
            else:
                self.client.stop()
                self.client.set_hosts(self.conn_str)
            self.client.start()

    def terminate(self):
        if self.client:
            self.client.stop()

    def session_listener(self, state):
        pass

    def get_conn_str(self):
        return self.conn_str

    def get(self, *params):
        self.hosts_cache.touch()
        return self.client.retry(self.client.get, *params)

    def get_async(self, *params):
        # Exhibitor is not polled here and it's totally fine!
        self.async_counter.increment()
        try:
            i_async = self.client.get_async(*params)
            i_async.rawlink(self._decrement)
            return i_async
        except Exception as e:
            self._decrement()
            raise e

    def _decrement(self, *args, **kwargs):
        self.async_counter.decrement()

    def set(self, *args, **kwargs):
        self.hosts_cache.touch()
        return self.client.retry(self.client.set, *args, **kwargs)

    def create(self, *args, **kwargs):
        self.hosts_cache.touch()
        return self.client.retry(self.client.create, *args, **kwargs)

    def delete(self, *args, **kwargs):
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.delete, *args, **kwargs)
        except NoNodeError:
            pass

    def get_children(self, *params):
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []

    def take_lock(self, *args, **kwargs):
        while True:
            try:
                self.hosts_cache.touch()
                return self.client.Lock(*args, **kwargs)
            except Exception as e:
                _LOG.error('Failed to obtain lock for exhibitor, retrying',
                           exc_info=e)
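
A usage sketch for the proxy; the real AddressListProvider implementation is project-specific, so the static stand-in below (returning the (hosts, port) pair that _update_hosts unpacks) is purely illustrative:

class StaticAddressProvider(object):
    def __init__(self, hosts, port):
        self._value = (hosts, port)

    def get_latest_address(self):
        return self._value

proxy = _ZookeeperProxy(StaticAddressProvider(['zk1', 'zk2'], 2181), '/cluster')
ids = proxy.get_children('/brokers/ids')    # retried; [] if the node is missing
proxy.terminate()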
Example n. 53
0
class ZooKeeper(object):
    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = "/live_nodes"
    ALIASES = "/aliases.json"
    CLUSTER_STATE = "/clusterstate.json"
    COLLECTION_STATUS = "/collections"
    COLLECTION_STATE = "/collections/%s/state.json"
    SHARDS = "shards"
    REPLICAS = "replicas"
    STATE = "state"
    ACTIVE = "active"
    LEADER = "leader"
    BASE_URL = "base_url"
    TRUE = "true"
    FALSE = "false"
    COLLECTION = "collection"

    def __init__(self,
                 zkServerAddress,
                 timeout=15,
                 max_retries=-1,
                 kazoo_client=None):
        if KazooClient is None:
            logging.error(
                "ZooKeeper requires the `kazoo` library to be installed")
            raise RuntimeError

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(
                zkServerAddress,
                read_only=True,
                timeout=timeout,
                command_retry={"max_tries": max_retries},
                connection_retry={"max_tries": max_retries},
            )
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            if state == KazooState.LOST:
                self.state = state
            elif state == KazooState.SUSPENDED:
                self.state = state

        self.zk.add_listener(connectionListener)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if not data:
                logger.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode("utf-8"))
                logger.info("Updated collections: %s", self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            logger.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode("utf-8"))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    logger.warning(
                        "Expected to find %s in alias update %s",
                        ZooKeeper.COLLECTION,
                        json_data.keys(),
                    )
            else:
                self.aliases = None
            logger.info("Updated aliases: %s", self.aliases)

        def watchCollectionState(data, *args, **kwargs):
            if not data:
                logger.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections.update(json.loads(data.decode("utf-8")))
                logger.info("Updated collections: %s", self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.COLLECTION_STATUS)
        def watchCollectionStatus(children):
            logger.info("Updated collection: %s", children)
            for c in children:
                self.zk.DataWatch(self.COLLECTION_STATE % c,
                                  watchCollectionState)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        hosts = []
        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)
        collection = self.collections[collname]
        shards = collection[ZooKeeper.SHARDS]
        for shardname in shards.keys():
            shard = shards[shardname]
            if shard[ZooKeeper.STATE] == ZooKeeper.ACTIVE:
                replicas = shard[ZooKeeper.REPLICAS]
                for replicaname in replicas.keys():
                    replica = replicas[replicaname]

                    if replica[ZooKeeper.STATE] == ZooKeeper.ACTIVE:
                        if not only_leader or (replica.get(
                                ZooKeeper.LEADER, None) == ZooKeeper.TRUE):
                            base_url = replica[ZooKeeper.BASE_URL]
                            if base_url not in hosts:
                                hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        if seen_aliases:
            if collname in seen_aliases:
                logger.warning("%s in circular alias definition - ignored",
                               collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)
        collections = self.aliases[collname].split(",")
        hosts = []
        for collection in collections:
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError("ZooKeeper returned no active shards!")
        return "%s/%s" % (random.choice(hosts), collname)  # NOQA: B311

    def getLeaderURL(self, collname):
        return self.getRandomURL(collname, only_leader=True)
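
A minimal usage sketch, assuming a reachable SolrCloud ensemble and an existing collection (both placeholders):

zk = ZooKeeper('localhost:2181')
print(zk.getHosts('collection1'))           # base URLs of all active replicas
print(zk.getRandomURL('collection1'))       # e.g. http://10.0.0.1:8983/solr/collection1
print(zk.getLeaderURL('collection1'))       # same, restricted to shard leaders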
Example n. 54
0
class Coordinator(object):
    def __init__(self, zk_hosts, hostname, port, join_cluster):
        self.me = '%s:%s' % (hostname, port)
        self.is_leader = None
        self.followers = cycle([])
        self.follower_count = 0
        self.started_shutdown = False

        read_only = not join_cluster

        self.zk = KazooClient(hosts=zk_hosts, handler=SequentialGeventHandler(), read_only=read_only)
        event = self.zk.start_async()
        event.wait(timeout=5)

        self.lock = self.zk.Lock(path='/iris/sender_leader', identifier=self.me)

        # Used to keep track of followers / senders present in cluster
        self.party = Party(client=self.zk, path='/iris/sender_nodes', identifier=self.me)

        if join_cluster:
            self.zk.add_listener(self.event_listener)
            self.party.join()

    def am_i_leader(self):
        return self.is_leader

    # Used for API to get the current leader
    def get_current_leader(self):
        try:
            contenders = self.lock.contenders()
        except kazoo.exceptions.KazooException:
            logger.exception('Failed getting contenders')
            return None

        if contenders:
            return self.address_to_tuple(contenders[0])
        else:
            return None

    # Used for API to get the current followers if leader can't be reached
    def get_current_followers(self):
        return [self.address_to_tuple(host) for host in self.party]

    def address_to_tuple(self, address):
        try:
            host, port = address.split(':')
            return host, int(port)
        except (IndexError, ValueError):
            logger.error('Failed getting address tuple from %s', address)
            return None

    def update_status(self):
        if self.started_shutdown:
            return

        if self.zk.state == KazooState.CONNECTED:
            if self.lock.is_acquired:
                self.is_leader = True
            else:
                try:
                    self.is_leader = self.lock.acquire(blocking=False, timeout=2)

                # This one is expected when we're recovering from ZK being down
                except kazoo.exceptions.CancelledError:
                    self.is_leader = False

                except kazoo.exceptions.LockTimeout:
                    self.is_leader = False
                    logger.exception('Failed trying to acquire lock (shouldn\'t happen as we\'re using nonblocking locks)')

                except kazoo.exceptions.KazooException:
                    self.is_leader = False
                    logger.exception('ZK problem while trying to acquire lock')
        else:
            logger.error('ZK connection is in %s state', self.zk.state)
            self.is_leader = False

        if self.zk.state == KazooState.CONNECTED:

            if self.is_leader:
                followers = [self.address_to_tuple(host) for host in self.party if host != self.me]
                self.follower_count = len(followers)
                self.followers = cycle(followers)
            else:
                self.followers = cycle([])
                self.follower_count = 0

            # Keep us as part of the party, so the current leader sees us as a follower
            if not self.party.participating:
                try:
                    self.party.join()
                except kazoo.exceptions.KazooException:
                    logger.exception('ZK problem while trying to join party')
        else:
            self.followers = cycle([])
            self.follower_count = 0

    def update_forever(self):
        while True:
            if self.started_shutdown:
                return

            old_status = self.is_leader
            self.update_status()
            new_status = self.is_leader

            if old_status != new_status:
                log = logger.info
            else:
                log = logger.debug

            if self.is_leader:
                log('I am the leader sender')
            else:
                log('I am a follower sender')

            metrics.set('follower_instance_count', self.follower_count)
            metrics.set('is_leader_sender', int(self.is_leader is True))

            sleep(UPDATE_FREQUENCY)

    def leave_cluster(self):
        self.started_shutdown = True

        # cancel any attempts to acquire leader lock which could make us hang
        self.lock.cancel()

        if self.zk.state == KazooState.CONNECTED:
            if self.party and self.party.participating:
                logger.info('Leaving party')
                self.party.leave()
            if self.lock and self.lock.is_acquired:
                logger.info('Releasing lock')
                self.lock.release()

        # Make us not the leader
        self.is_leader = False

        # Avoid sending metrics that we are still the leader when we're not
        metrics.set('is_leader_sender', 0)

    def event_listener(self, state):
        if state == KazooState.LOST or state == KazooState.SUSPENDED:
            logger.info('ZK state transitioned to %s. Resetting leader status.', state)

            # cancel pending attempts to acquire lock which will break and leave
            # us in bad state
            self.lock.cancel()

            # make us try to re-acquire lock during next iteration when we're connected
            if self.lock.is_acquired:
                self.lock.is_acquired = False

            # make us try to rejoin the party during next iteration when we're connected
            if self.party.participating:
                self.party.participating = False

            # in the meantime we're not leader
            self.is_leader = None
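
A wiring sketch for the coordinator; the host and port values are placeholders, and update_forever is spawned in a greenlet because it sleeps between iterations:

import gevent

coordinator = Coordinator(zk_hosts='zk1:2181,zk2:2181',
                          hostname='sender1.example.com',
                          port=2321,
                          join_cluster=True)
gevent.spawn(coordinator.update_forever)
# ... route work to coordinator.followers while coordinator.am_i_leader() ...
coordinator.leave_cluster()                 # on shutdown: cancel the lock, leave the party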
Example n. 55
0
def open_connection():
    global zk
    zk = KazooClient(hosts=CONNECT_STRING, timeout=50)
    zk.add_listener(my_listener)
    zk.start(timeout=150)
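
The my_listener registered above is defined elsewhere in the module; a typical shape (illustrative only, KazooState import assumed) just logs the transitions so reconnects are visible:

def my_listener(state):
    if state == KazooState.LOST:
        logging.warning('session lost - ephemeral nodes are gone')
    elif state == KazooState.SUSPENDED:
        logging.warning('disconnected - waiting for reconnect')
    else:  # KazooState.CONNECTED
        logging.info('(re)connected')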
Example n. 56
0
class ZooKeeper(object):
    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = '/live_nodes'
    ALIASES = '/aliases.json'
    CLUSTER_STATE = '/clusterstate.json'
    SHARDS = 'shards'
    REPLICAS = 'replicas'
    STATE = 'state'
    ACTIVE = 'active'
    LEADER = 'leader'
    BASE_URL = 'base_url'
    TRUE = 'true'
    FALSE = 'false'
    COLLECTION = 'collection'

    def __init__(self,
                 zkServerAddress,
                 timeout=15,
                 max_retries=-1,
                 kazoo_client=None):
        if KazooClient is None:
            logging.error(
                'ZooKeeper requires the `kazoo` library to be installed')
            raise RuntimeError

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(zkServerAddress,
                                  read_only=True,
                                  timeout=timeout,
                                  command_retry={'max_tries': max_retries},
                                  connection_retry={'max_tries': max_retries})
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            if state == KazooState.LOST:
                self.state = state
            elif state == KazooState.SUSPENDED:
                self.state = state

        self.zk.add_listener(connectionListener)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if not data:
                LOG.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode('utf-8'))
                LOG.info('Updated collections: %s', self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            LOG.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode('utf-8'))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    LOG.warning('Expected to find %s in alias update %s',
                                ZooKeeper.COLLECTION, json_data.keys())
            else:
                self.aliases = None
            LOG.info("Updated aliases: %s", self.aliases)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        hosts = []
        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)
        collection = self.collections[collname]
        shards = collection[ZooKeeper.SHARDS]
        for shardname in shards.keys():
            shard = shards[shardname]
            if shard[ZooKeeper.STATE] == ZooKeeper.ACTIVE:
                replicas = shard[ZooKeeper.REPLICAS]
                for replicaname in replicas.keys():
                    replica = replicas[replicaname]

                    if replica[ZooKeeper.STATE] == ZooKeeper.ACTIVE:
                        if not only_leader or (replica.get(
                                ZooKeeper.LEADER, None) == ZooKeeper.TRUE):
                            base_url = replica[ZooKeeper.BASE_URL]
                            if base_url not in hosts:
                                hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        if seen_aliases:
            if collname in seen_aliases:
                LOG.warning("%s in circular alias definition - ignored",
                            collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)
        collections = self.aliases[collname].split(",")
        hosts = []
        for collection in collections:
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError('ZooKeeper returned no active shards!')
        return '%s/%s' % (random.choice(hosts), collname)

    def getLeaderURL(self, collname):
        return self.getRandomURL(collname, only_leader=True)
Example n. 57
0
    return str(value.decode())


def to_response(string):
    return string.content.decode('utf-8').strip("\"")


def hash_function(string):
    return sum(map(ord, list(string)))


try:
    logging.basicConfig()
    zkr = KazooRetry(max_tries=-1)
    client = KazooClient(hosts="127.0.0.1:2181", connection_retry=zkr)
    client.add_listener(zk_status_listener)
    client.start()

    if client.exists("/servers/master"):

        @client.ChildrenWatch("/servers/")
        def become_master(children):
            if "master" not in children:
                instances = client.get_children("/servers/slaves/")
                if instances[0].split("_")[-1] == PORT:
                    print(">>> This server is the new master")
                    client.create("/servers/master", ephemeral=True)
                    client.set("/servers/master", PORT.encode())

                    if client.exists("/servers/status"):
                        client.delete("/servers/status")
Example n. 58
0
class ConsistentScheduler(object):
    '''
        LibPartitionHelper abstracts out workers and work_items, and their
        mapping to partitions, so the application only has to deal with the
        work items it owns, without bothering about the partition mapping.

        This class also provides synchronization primitives to ensure apps
        can clean up before giving up their partitions.
    '''
    _MAX_WAIT_4_ALLOCATION = 6 + randint(0, 9)

    def __init__(self,
                 service_name=None,
                 zookeeper='127.0.0.1:2181',
                 delete_hndlr=None,
                 add_hndlr=None,
                 bucketsize=47,
                 item2part_func=None,
                 partitioner=None,
                 logger=None,
                 cluster_id=''):
        if logger:
            self._logger = logger
        else:
            self._logger = logging.getLogger(__name__)
        self._service_name = service_name or os.path.basename(sys.argv[0])
        self._item2part_func = item2part_func or self._device2partition
        self._zookeeper_srvr = zookeeper
        self._bucketsize = bucketsize
        self._delete_hndlr = delete_hndlr
        self._add_hndlr = add_hndlr
        self._partitioner = partitioner or self._partitioner_func
        self._partitions = {}
        self._con_hash = None
        self._last_log = ''
        self._last_log_cnt = 0
        self._partition_set = map(str, range(self._bucketsize))

        self._cluster_id = cluster_id
        if self._cluster_id:
            self._zk_path = '/' + self._cluster_id + '/contrail_cs' + '/' + self._service_name
        else:
            self._zk_path = '/'.join(['/contrail_cs', self._service_name])
        self._zk = KazooClient(self._zookeeper_srvr,
                               handler=SequentialGeventHandler())
        self._zk.add_listener(self._zk_lstnr)
        self._conn_state = None
        while True:
            try:
                self._zk.start()
                break
            except gevent.event.Timeout as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
            # Zookeeper is also throwing exception due to delay in master election
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
        self._pc = self._zk.SetPartitioner(path=self._zk_path,
                                           set=self._partition_set,
                                           partition_func=self._partitioner)
        self._wait_allocation = 0
        gevent.sleep(0)

    def _sandesh_connection_info_update(self, status, message):
        from pysandesh.connection_info import ConnectionState
        from pysandesh.gen_py.process_info.ttypes import ConnectionStatus, \
            ConnectionType

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper',
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zookeeper_srvr.split(','))

        if ((self._conn_state and self._conn_state != ConnectionStatus.DOWN)
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._supress_log(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._supress_log(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_lstnr(self, state):
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(
                status='UP', message='Connection to Zookeeper established')
        elif state == KazooState.LOST:
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def schedule(self, items, lock_timeout=30):
        gevent.sleep(0)
        ret = False
        if self._pc.failed:
            self._logger.error('Lost or unable to acquire partition')
            os._exit(2)
        elif self._pc.release:
            self._supress_log('Releasing...')
            self._release()
        elif self._pc.allocating:
            self._supress_log('Waiting for allocation...')
            self._pc.wait_for_acquire(lock_timeout)
            if self._wait_allocation < self._MAX_WAIT_4_ALLOCATION:
                self._wait_allocation += 1
            else:
                self._logger.error('Giving up after %d tries!' %
                                   (self._wait_allocation))
                os._exit(2)
        elif self._pc.acquired:
            self._supress_log('got work: ', list(self._pc))
            ret = True
            self._wait_allocation = 0
            self._populate_work_items(items)
            self._supress_log('work items: ',
                              self._items2name(self.work_items()),
                              'from the list', self._items2name(items))
        return ret

    def members(self):
        return list(self._con_hash.nodes)

    def partitions(self):
        return list(self._pc)

    def work_items(self):
        return sum(self._partitions.values(), [])

    def finish(self):
        self._inform_delete(self._partitions.keys())
        self._pc.finish()

    def _items2name(self, items):
        return map(lambda x: x.name, items)

    def _supress_log(self, *s):
        slog = ' '.join(map(str, s))
        dl = ''
        if slog != self._last_log:
            if self._last_log_cnt:
                dl += ' ' * 4
                dl += '.' * 8
                dl += '[last print repeats %d times]' % self._last_log_cnt
                self._last_log_cnt = 0
            dl += slog
            self._last_log = slog
            self._logger.debug(dl)
        else:
            self._last_log_cnt += 1

    def _consistent_hash(self, members):
        if self._con_hash is None:
            self._con_hash = ConsistentHash(members)
            self._logger.error('members: %s' % (str(self._con_hash.nodes)))
        cur, updtd = set(self._con_hash.nodes), set(members)
        if cur != updtd:
            newm = updtd - cur
            rmvd = cur - updtd
            if newm:
                self._logger.error('new members: %s' % (str(newm)))
                self._con_hash.add_nodes(list(newm))
            if rmvd:
                self._logger.error('members left: %s' % (str(rmvd)))
                self._con_hash.del_nodes(list(rmvd))
        return self._con_hash

    def _consistent_hash_get_node(self, members, partition):
        return self._consistent_hash(members).get_node(partition)

    def _partitioner_func(self, identifier, members, _partitions):
        partitions = [p for p in _partitions \
            if self._consistent_hash_get_node(members, p) == identifier]
        self._logger.error('partitions: %s' % (str(partitions)))
        return partitions

    def _release(self):
        old = set(self._pc)
        new = set(
            self._partitioner(self._pc._identifier, list(self._pc._party),
                              self._partition_set))
        rmvd = old - new
        added = new - old
        if rmvd:
            self._inform_delete(list(rmvd))
        if added:
            self._inform_will_add(list(added))
        self._pc.release_set()

    def _list_items_in(self, partitions):
        return sum([self._partitions[k] for k in partitions if k in \
                    self._partitions], [])

    def _inform_will_add(self, partitions):
        if callable(self._add_hndlr):
            self._add_hndlr(self._list_items_in(partitions))

    def _inform_delete(self, partitions):
        if callable(self._delete_hndlr):
            self._delete_hndlr(self._list_items_in(partitions))

    def _populate_work_items(self, items):
        self._refresh_work_items()
        for i in items:
            part = str(self._item2part_func(i.name))
            if part in list(self._pc):
                if part not in self._partitions:
                    self._partitions[part] = []
                if i.name not in map(lambda x: x.name, self._partitions[part]):
                    self._partitions[part].append(i)
        self._logger.debug('@populate_work_items(%s): done!' % ' '.join(
            map(
                lambda v: str(v[0]) + ':' + ','.join(
                    map(lambda x: x.name, v[1])), self._partitions.items())))
        gevent.sleep(0)

    def _device2partition(self, key):
        return struct.unpack(
            'Q',
            hashlib.md5(key).digest()[-8:])[0] % self._bucketsize

    def _refresh_work_items(self):
        for k in self._partitions:
            self._partitions[k] = []
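
A usage sketch for the scheduler: work items are assumed to expose a .name attribute (inferred from _populate_work_items), and schedule() is polled periodically; the handlers and item names here are placeholders:

class Item(object):
    def __init__(self, name):
        self.name = name

scheduler = ConsistentScheduler(service_name='collector',
                                zookeeper='127.0.0.1:2181',
                                add_hndlr=lambda items: None,     # start work on items
                                delete_hndlr=lambda items: None)  # stop work on items
items = [Item('device-%d' % i) for i in range(10)]
while True:
    if scheduler.schedule(items):
        owned = scheduler.work_items()      # only the items in our partitions
    gevent.sleep(10)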
Example n. 59
0
class USSMetadataManager(object):
    """Interfaces with the locking system to get, put, and delete USS metadata.

  Metadata gets/stores/deletes the USS information for a particular grid,
  including current version number, a list of USSs with active operations,
  and the endpoints to get that information. Locking is assured through a
  snapshot token received when getting, and used when putting.
  """
    def __init__(self, connectionstring=DEFAULT_CONNECTION, testgroupid=None):
        """Initializes the class.

    Args:
      connectionstring:
        Zookeeper connection string - server:port,server:port,...
      testgroupid:
        ID to use if in test mode, none for normal mode
    """
        if testgroupid:
            self.set_testmode(testgroupid)
        if not connectionstring:
            connectionstring = DEFAULT_CONNECTION
        log.debug(
            'Creating metadata manager object and connecting to zookeeper...')
        try:
            if set(BAD_CHARACTER_CHECK) & set(connectionstring):
                raise ValueError
            self.zk = KazooClient(hosts=connectionstring,
                                  timeout=CONNECTION_TIMEOUT)
            self.zk.add_listener(self.zookeeper_connection_listener)
            self.zk.start()
            if testgroupid:
                self.delete_testdata(testgroupid)
        except KazooTimeoutError:
            log.error(
                'Unable to connect to zookeeper using %s connection string...',
                connectionstring)
            raise
        except ValueError:
            log.error('Connection string %s seems invalid...',
                      connectionstring)
            raise

    def __del__(self):
        log.debug(
            'Destroying metadata manager object and disconnecting from zk...')
        self.zk.stop()

    def get_state(self):
        return self.zk.state

    def get_version(self):
        try:
            return True, self.zk.server_version()
        except KazooException as e:
            msg = str(e)
            return False, type(e).__name__ + (' ' + msg if msg else '')

    def set_verbose(self):
        log.setLevel(logging.DEBUG)

    def set_testmode(self, testgroupid='UNDEFINED_TESTER'):
        """Sets the mode to testing with the specific test ID, cannot be undone.

    Args:
      testgroupid: ID to use if in test mode, none for normal mode
    """
        global GRID_PATH
        global CONNECTION_TIMEOUT
        # Adjust parameters specifically for the test
        GRID_PATH = TEST_BASE_PREFIX + testgroupid + USS_BASE_PREFIX
        log.debug('Setting test path to %s...', GRID_PATH)
        CONNECTION_TIMEOUT = 1.0

    def zookeeper_connection_listener(self, state):
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            log.error('Lost connection with the zookeeper servers...')
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            log.error('Suspended connection with the zookeeper servers...')
        elif state == KazooState.CONNECTED:
            # Handle being connected/reconnected to Zookeeper
            log.info('Connection restored with the zookeeper servers...')

    def delete_testdata(self, testgroupid=None):
        """Removes the test data from the servers.

    Be careful when using this in parallel as it removes everything under
    the testgroupid, or everything if no testgroupid is provided.

    Args:
      testgroupid: ID to use if in test mode, none will remove all test data
    """
        if testgroupid:
            path = TEST_BASE_PREFIX + testgroupid
        else:
            path = TEST_BASE_PREFIX
        self.zk.delete(path, recursive=True)

    def get(self, z, x, y):
        """Gets the metadata and snapshot token for a GridCell.

    Reads data from zookeeper, including a snapshot token. The
    snapshot token is used as a reference when writing to ensure
    the data has not been updated between read and write.

    Args:
      z: zoom level in slippy tile format
      x: x tile number in slippy tile format
      y: y tile number in slippy tile format
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """
        # TODO(hikevin): Change to use our own error codes and let the server
        #                   convert them to http error codes. For now, this is
        #                   at least in a standard JSend format.
        status = 500
        if slippy_util.validate_slippy(z, x, y):
            (content, metadata) = self._get_raw(z, x, y)
            if metadata:
                try:
                    m = uss_metadata.USSMetadata(content)
                    status = 200
                    result = {
                        'status': 'success',
                        'sync_token': metadata.last_modified_transaction_id,
                        'data': m.to_json()
                    }
                except ValueError:
                    status = 424
            else:
                status = 404
        else:
            status = 400
        if status != 200:
            result = self._format_status_code_to_jsend(status)
        return result

    def set(self, z, x, y, sync_token, uss_id, ws_scope, operation_format,
            operation_ws, earliest_operation, latest_operation):
        """Sets the metadata for a GridCell.

    Writes data, using the snapshot token for confirming data
    has not been updated since it was last read.

    Args:
      z: zoom level in slippy tile format
      x: x tile number in slippy tile format
      y: y tile number in slippy tile format
      sync_token: token retrieved in the original GET GridCellMetadata,
      uss_id: plain text identifier for the USS,
      ws_scope: scope to use to obtain OAuth token,
      operation_format: output format for operation ws (i.e. NASA, GUTMA),
      operation_ws: submitting USS endpoint where all flights in
        this cell can be retrieved from,
      earliest_operation: lower bound of active or planned flight timestamp,
        used for quick filtering conflicts.
      latest_operation: upper bound of active or planned flight timestamp,
        used for quick filtering conflicts.
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """
        if slippy_util.validate_slippy(z, x, y):
            # first we have to get the cell
            (content, metadata) = self._get_raw(z, x, y)
            if metadata:
                # Quick check of the token, another is done on the actual set to be sure
                #    but this check fails early and fast
                if str(metadata.last_modified_transaction_id) == str(
                        sync_token):
                    try:
                        m = uss_metadata.USSMetadata(content)
                        log.debug('Setting metadata for %s...', uss_id)
                        if not m.upsert_operator(
                                uss_id, ws_scope, operation_format,
                                operation_ws, earliest_operation,
                                latest_operation, z, x, y):
                            log.error(
                                'Failed setting operator for %s with token %s...',
                                uss_id, str(sync_token))
                            raise ValueError
                        status = self._set_raw(z, x, y, m, metadata.version)
                    except ValueError:
                        status = 424
                else:
                    status = 409
            else:
                status = 404
        else:
            status = 400
        if status == 200:
            # Success, now get the metadata back to send back
            result = self.get(z, x, y)
        else:
            result = self._format_status_code_to_jsend(status)
        return result

    def delete(self, z, x, y, uss_id):
        """Sets the metadata for a GridCell by removing the entry for the USS.

    Args:
      z: zoom level in slippy tile format
      x: x tile number in slippy tile format
      y: y tile number in slippy tile format
      uss_id: is the plain text identifier for the USS
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """
        status = 500
        if slippy_util.validate_slippy(z, x, y):
            # first we have to get the cell
            (content, metadata) = self._get_raw(z, x, y)
            if metadata:
                try:
                    m = uss_metadata.USSMetadata(content)
                    m.remove_operator(uss_id)
                    # TODO(pelletierb): Automatically retry on delete
                    status = self._set_raw(z, x, y, m, metadata.version)
                except ValueError:
                    status = 424
            else:
                status = 404
        else:
            status = 400
        if status == 200:
            # Success, now get the metadata back to send back
            (content, metadata) = self._get_raw(z, x, y)
            result = {
                'status': 'success',
                'sync_token': metadata.last_modified_transaction_id,
                'data': m.to_json()
            }
        else:
            result = self._format_status_code_to_jsend(status)
        return result

    def get_multi(self, z, grids):
        """Gets the metadata and snapshot token for multiple GridCells.

    Reads data from zookeeper, including a composite snapshot token. The
    snapshot token is used as a reference when writing to ensure
    the data has not been updated between read and write.

    Args:
      z: zoom level in slippy tile format
      grids: list of (x,y) tiles to retrieve
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """
        try:
            combined_meta, syncs = self._get_multi_raw(z, grids)
            log.debug('Found sync token %s for %d grids...',
                      self._hash_sync_tokens(syncs), len(syncs))
            result = {
                'status': 'success',
                'sync_token': self._hash_sync_tokens(syncs),
                'data': combined_meta.to_json()
            }
        except ValueError as e:
            result = self._format_status_code_to_jsend(400, str(e))
        except IndexError as e:
            result = self._format_status_code_to_jsend(404, str(e))
        return result

    def set_multi(self, z, grids, sync_token, uss_id, ws_scope,
                  operation_format, operation_ws, earliest_operation,
                  latest_operation):
        """Sets multiple GridCells metadata at once.

    Writes data, using the hashed snapshot token for confirming data
    has not been updated since it was last read.

    Args:
      z: zoom level in slippy tile format
      grids: list of (x,y) tiles to update
      sync_token: token retrieved in the original get_multi,
      uss_id: plain text identifier for the USS,
      ws_scope: scope to use to obtain OAuth token,
      operation_format: output format for operation ws (i.e. NASA, GUTMA),
      operation_ws: submitting USS endpoint where all flights in
        this cell can be retrieved from,
      earliest_operation: lower bound of active or planned flight timestamp,
        used for quick filtering conflicts.
      latest_operation: upper bound of active or planned flight timestamp,
        used for quick filtering conflicts.
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """
        log.debug('Setting multiple grid metadata for %s...', uss_id)
        try:
            # first, get the affected grid's sync tokens
            m, syncs = self._get_multi_raw(z, grids)
            del m
            # Quick check of the token, another is done on the actual set to be sure
            #    but this check fails early and fast
            log.debug('Found sync token %d for %d grids...',
                      self._hash_sync_tokens(syncs), len(syncs))
            if str(self._hash_sync_tokens(syncs)) == str(sync_token):
                log.debug('Composite sync_token matches, continuing...')
                self._set_multi_raw(z, grids, syncs, uss_id, ws_scope,
                                    operation_format, operation_ws,
                                    earliest_operation, latest_operation)
                log.debug('Completed updating multiple grids...')
            else:
                raise KeyError('Composite sync_token has changed')
            combined_meta, new_syncs = self._get_multi_raw(z, grids)
            result = {
                'status': 'success',
                'sync_token': self._hash_sync_tokens(new_syncs),
                'data': combined_meta.to_json()
            }
        except (KeyError, RolledBackError) as e:
            result = self._format_status_code_to_jsend(409, str(e))
        except ValueError as e:
            result = self._format_status_code_to_jsend(400, str(e))
        except IndexError as e:
            result = self._format_status_code_to_jsend(404, str(e))
        return result

    def delete_multi(self, z, grids, uss_id):
        """Sets multiple GridCells metadata by removing the entry for the USS.

    Removes the operator from multiple cells. Does not return 404 on
    not finding the USS in a cell, since this should be a remove all
    type function, as some cells might have the ussid and some might not.
    
    Args:
      z: zoom level in slippy tile format
      grids: list of (x,y) tiles to delete
      uss_id: is the plain text identifier for the USS
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """
        log.debug('Deleting multiple grid metadata for %s...', uss_id)
        try:
            if not uss_id:
                raise ValueError('Invalid uss_id for deleting multi')
            for x, y in grids:
                if slippy_util.validate_slippy(z, x, y):
                    (content, metadata) = self._get_raw(z, x, y)
                    if metadata:
                        m = uss_metadata.USSMetadata(content)
                        m.remove_operator(uss_id)
                        # TODO(pelletierb): Automatically retry on delete
                        status = self._set_raw(z, x, y, m, metadata.version)
                else:
                    raise ValueError('Invalid slippy grids for lookup')
            result = self.get_multi(z, grids)
        except ValueError as e:
            result = self._format_status_code_to_jsend(400, e.message)
        return result

    ######################################################################
    ################       INTERNAL FUNCTIONS    #########################
    ######################################################################
    def _get_raw(self, z, x, y):
        """Gets the raw content and metadata for a GridCell from zookeeper.

    Args:
      z: zoom level in slippy tile format
      x: x tile number in slippy tile format
      y: y tile number in slippy tile format
    Returns:
      content: USS metadata
      metadata: straight from zookeeper
    """
        path = '%s/%s/%s/%s/%s' % (GRID_PATH, str(z), str(x), str(y),
                                   USS_METADATA_FILE)
        log.debug('Getting metadata from zookeeper@%s...', path)
        try:
            c, m = self.zk.get(path)
        except NoNodeError:
            self.zk.ensure_path(path)
            c, m = self.zk.get(path)
        if c:
            log.debug('Received raw content from zookeeper: %s', c)
        if m:
            log.debug('Received raw metadata from zookeeper: %s', m)
        return c, m

    def _set_raw(self, z, x, y, m, version):
        """Grabs the lock and updates the raw content for a GridCell in zookeeper.

    Args:
      z: zoom level in slippy tile format
      x: x tile number in slippy tile format
      y: y tile number in slippy tile format
      m: metadata object to write
      version: the metadata version verified from the sync_token match
    Returns:
      200 for success, 409 if the metadata version changed since it was read
    """
        path = '%s/%s/%s/%s/%s' % (GRID_PATH, str(z), str(x), str(y),
                                   USS_METADATA_FILE)
        try:
            log.debug('Setting metadata to %s...', str(m))
            self.zk.set(path, json.dumps(m.to_json()), version)
            status = 200
        except BadVersionError:
            log.error('Sync token updated before write for %s...', path)
            status = 409
        return status

    def _get_multi_raw(self, z, grids):
        """Gets the raw content and metadata for multiple GridCells from zookeeper.

    Args:
      z: zoom level in slippy tile format
      grids: list of (x,y) tiles to retrieve
    Returns:
      content: Combined USS metadata
      syncs: list of sync tokens in the same order as the grids
    Raises:
      IndexError: if it cannot find anything in zookeeper
      ValueError: if the grid data is not in the right format
    """
        log.debug('Getting multiple grid metadata for %s...', str(grids))
        combined_meta = None
        syncs = []
        for x, y in grids:
            if slippy_util.validate_slippy(z, x, y):
                (content, metadata) = self._get_raw(z, x, y)
                if metadata:
                    # Accumulate each cell's metadata; start from the first
                    #   cell's copy since combined_meta begins as None
                    cell_meta = uss_metadata.USSMetadata(content)
                    combined_meta = (combined_meta + cell_meta
                                     if combined_meta else cell_meta)
                    syncs.append(metadata.last_modified_transaction_id)
                else:
                    raise IndexError('Unable to find metadata in platform')
            else:
                raise ValueError('Invalid slippy grids for lookup')
        if len(syncs) == 0:
            raise IndexError('Unable to find metadata in platform')
        return combined_meta, syncs

    def _set_multi_raw(self, z, grids, sync_tokens, uss_id, ws_scope,
                       operation_format, operation_ws, earliest_operation,
                       latest_operation):
        """Grabs the lock and updates the raw content for multiple GridCells

    Args:
      z: zoom level in slippy tile format
      grids: list of (x,y) tiles to retrieve
      sync_tokens: list of the sync tokens received during get operation
      uss_id: plain text identifier for the USS
      ws_scope: scope to use to obtain OAuth token
      operation_format: output format for operation ws (e.g. NASA, GUTMA)
      operation_ws: submitting USS endpoint where all flights in
        this cell can be retrieved from
      earliest_operation: lower bound of active or planned flight timestamp,
        used for quick filtering of conflicts
      latest_operation: upper bound of active or planned flight timestamp,
        used for quick filtering of conflicts
    Raises:
      IndexError: if it cannot find anything in zookeeper
      ValueError: if the grid data is not in the right format
    """
        log.debug('Setting multiple grid metadata for %s...', str(grids))
        try:
            contents = []
            # First, get and update all cells in memory, validating each
            #   sync_token before anything is written
            for (x, y), sync_token in zip(grids, sync_tokens):
                path = '%s/%s/%s/%s/%s' % (GRID_PATH, str(z), str(x), str(y),
                                           USS_METADATA_FILE)
                (content, metadata) = self._get_raw(z, x, y)
                if str(metadata.last_modified_transaction_id) == str(
                        sync_token):
                    log.debug('Sync_token matches for %d, %d...', x, y)
                    m = uss_metadata.USSMetadata(content)
                    if not m.upsert_operator(
                            uss_id, ws_scope, operation_format, operation_ws,
                            earliest_operation, latest_operation, z, x, y):
                        raise ValueError('Failed to set operator content')
                    contents.append((path, m, metadata.version))
                else:
                    log.error(
                        'Sync token from USS (%s) does not match token from zk (%s)...',
                        str(sync_token),
                        str(metadata.last_modified_transaction_id))
                    raise KeyError('Composite sync_token has changed')
            # Now, start a transaction to update them all
            #  the version will catch any changes and roll back any attempted
            #  updates to the grids
            log.debug('Starting transaction to write all grids at once...')
            t = self.zk.transaction()
            for path, m, version in contents:
                t.set_data(path, json.dumps(m.to_json()), version)
            log.debug('Committing transaction...')
            results = t.commit()
            # kazoo returns per-operation results; any exception entry
            #   (failed or rolled back) means no grid was written
            if any(isinstance(r, Exception) for r in results):
                raise KeyError(
                    'Rolled back multi-grid transaction due to grid change')
            log.debug('Committed transaction successfully.')
        except (KeyError, ValueError, IndexError) as e:
            log.error('Error caught in set_multi_raw %s.', e.message)
            raise e

    def _format_status_code_to_jsend(self, status, message=None):
        """Formats a response based on HTTP status code.

    Args:
      status: HTTP status code
      message: optional message to override preset message for codes
    Returns:
      JSend formatted response (https://labs.omniti.com/labs/jsend)
    """

        if status == 200 or status == 204:
            result = {
                'status': 'success',
                'code': 204,
                'message': 'Empty data set.'
            }
        elif status == 400:
            result = {
                'status': 'fail',
                'code': status,
                'message': 'Parameters are not following the correct format.'
            }
        elif status == 404:
            result = {
                'status': 'fail',
                'code': status,
                'message': 'Unable to pull metadata from lock system.'
            }
        elif status == 408:
            result = {
                'status': 'fail',
                'code': status,
                'message': 'Timeout trying to get lock.'
            }
        elif status == 409:
            result = {
                'status': 'fail',
                'code': status,
                'message': ('Content in metadata has been updated since '
                            'provided sync token.')
            }
        elif status == 424:
            result = {
                'status': 'fail',
                'code': status,
                'message': ('Content in metadata is not following JSON '
                            'format guidelines.')
            }
        else:
            result = {
                'status': 'fail',
                'code': status,
                'message': 'Unknown error code occurred.'
            }
        if message:
            result['message'] = message
        return result

    @staticmethod
    def _hash_sync_tokens(syncs):
        """Hashes a list of sync tokens into a single, positive 64-bit int.

    The tokens are sorted first, so the result is independent of grid order.
    For various languages, the limit to integers may be different, therefore
    we truncate to ensure the hash is the same on all implementations.
    """
        return abs(hash(tuple(sorted(syncs)))) % MAX_SAFE_INTEGER
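
The class above implements optimistic concurrency: readers receive a composite sync_token from get_multi, echo it back to set_multi, and get a 409 JSend response if any grid changed in between. A minimal caller-side retry loop might look like the following sketch, where `store`, the grid list, and all operator parameters are illustrative placeholders rather than values from the original project:

# Hypothetical retry loop around the optimistic sync_token protocol.
# `store` is an instance of the storage class above; every literal below
# (scope, format, endpoint, timestamps) is an illustrative placeholder.
def write_with_retry(store, z, grids, uss_id, max_attempts=3):
    result = None
    for _ in range(max_attempts):
        snapshot = store.get_multi(z, grids)
        if snapshot['status'] != 'success':
            return snapshot  # propagate 400/404 failures unchanged
        result = store.set_multi(
            z, grids, snapshot['sync_token'], uss_id,
            'utm.nasa.gov_write.conflictmanagement',  # ws_scope
            'NASA',                                   # operation_format
            'https://uss.example.com/flights',        # operation_ws
            '2018-01-01T00:00:00Z',                   # earliest_operation
            '2018-01-01T01:00:00Z')                   # latest_operation
        if result.get('code') != 409:
            return result  # success or a non-retryable failure
        # 409: another USS wrote to a grid between our read and write
    return result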
Example n. 60
0
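# Assumed imports for this example (not shown in the original snippet);
# GroupOwnersTopic and TopicValue are helper classes defined elsewhere
# in the same project:
#   import threading
#   import time
#   from kazoo.client import KazooClient
#   from kazoo.recipe.watchers import ChildrenWatch, DataWatch
#   from kafka import KafkaConsumer, TopicPartition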
class ClusterZookeeper(object):
    def __init__(self, zookeeper_hosts, kafka_hosts):
        self.groups_dict = {}
        self.topics_dict = {}
        self.brokers_list = []
        self.consumer = KafkaConsumer(bootstrap_servers=kafka_hosts.split(','))
        self.zk = KazooClient(hosts=zookeeper_hosts)
        self.zk.add_listener(self.keep_start)
        self.zk.start()
        if self.zk.exists('/consumers') is None or self.zk.exists('/brokers') is None:
            raise ValueError(zookeeper_hosts + ' is not a zookeeper ensemble used by kafka')
        ChildrenWatch(self.zk, '/consumers', self.groups_watch)
        ChildrenWatch(self.zk, '/brokers/topics', self.topics_watch)
        ChildrenWatch(self.zk, '/brokers/ids', self.brokers_watch)  # znode paths must not end with '/'
        t = threading.Thread(target=self.latest, name=kafka_hosts)
        t.setDaemon(True)
        t.start()

    # Keep the connection alive: restart the client whenever it leaves the
    #   CONNECTED state. Kazoo runs listeners on its connection thread, so
    #   the blocking start() call here should be used with care.
    def keep_start(self, client_status):
        if client_status != 'CONNECTED':
            try:
                self.zk.start()
            except Exception:
                pass

    # Watch the /consumers znode for consumer-group membership changes
    def groups_watch(self, children):
        for group in [group for group in self.groups_dict.keys() if group not in children]:
            self.groups_dict.pop(group)
        for group in [group for group in children if group not in self.groups_dict.keys()]:
            owners_p = '/consumers/' + group + '/owners'
            if self.zk.exists(owners_p) is None:
                continue
            g_o_t = GroupOwnersTopic()
            self.groups_dict[group] = g_o_t
            ChildrenWatch(self.zk, owners_p, g_o_t.g_topic_watch)

    # Watch /brokers/topics for topic creation and deletion
    def topics_watch(self, children):
        for topic in [topic for topic in self.topics_dict.keys() if topic not in children]:
            self.topics_dict.pop(topic)
        for topic in [topic for topic in children if topic not in self.topics_dict.keys()]:
            t_v = TopicValue()
            self.topics_dict[topic] = t_v
            DataWatch(self.zk, '/brokers/topics/' + topic, t_v.topic_watch)
            t_v.topic_partition = [TopicPartition(topic, p) for p in self.consumer.partitions_for_topic(topic)]

    # Watch /brokers/ids for broker membership changes
    def brokers_watch(self, children):
        self.brokers_list = children

    def close_zk(self):
        try:
            self.zk.remove_listener(self.keep_start)
            self.zk.stop()
            self.zk.close()
        except Exception:
            pass

    def latest(self):
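        # Poll the log-end offset of every tracked topic and derive a rough
        #   produce rate (messages/second) between successive samples.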
        while True:
            # time.sleep(0.1)
            time.sleep(0.001)
            for k, v in self.topics_dict.items():
                try:
                    partitions = v.topic_partition
                    self.consumer.assign(partitions)
                    self.consumer.seek_to_end(*partitions)
                    log_offset = reduce(lambda x, y: x + y, [self.consumer.position(p) for p in partitions])
                    now_timestamp = int(time.mktime(time.localtime()))
                    if ('timestamp' in v.__dict__ and v.timestamp is not None
                            and now_timestamp > v.timestamp):
                        # Guard against a zero-second delta from the tight loop
                        v.speed = ((log_offset - v.off_set) /
                                   (now_timestamp - v.timestamp))
                    v.timestamp = now_timestamp
                    v.off_set = log_offset
                except Exception:
                    # Tolerate transient errors, e.g. partition metadata not
                    #   yet available for a newly created topic
                    pass
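
A minimal way to exercise ClusterZookeeper is sketched below; both connection strings are placeholders, and since the watcher dictionaries populate asynchronously the sketch waits briefly before reading them:

# Illustrative usage only; connection strings are placeholders.
import time

cluster = ClusterZookeeper('zk1:2181,zk2:2181', 'kafka1:9092,kafka2:9092')
try:
    time.sleep(5)  # let the ChildrenWatch/DataWatch callbacks populate state
    print('brokers: %s' % cluster.brokers_list)
    print('topics: %s' % ', '.join(cluster.topics_dict.keys()))
    for topic, info in cluster.topics_dict.items():
        if 'speed' in info.__dict__:
            print('%s -> offset %s, ~%s msg/s' % (topic, info.off_set, info.speed))
finally:
    cluster.close_zk()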