class HHFrontierWorker(FrontierWorker):
    """Frontier worker that registers itself in ZooKeeper.

    After the base worker is initialised, a Kazoo client is started and an
    ephemeral, sequential znode is created at ``/frontera/hh-f-worker``
    (missing parents are created). The resulting path is kept in
    ``self.znode_path``.
    """

    def __init__(self, settings, no_batches, no_scoring, no_incoming):
        super(HHFrontierWorker, self).__init__(settings, no_batches, no_scoring, no_incoming)
        # ``settings`` is a local name of this constructor only, so the
        # ZooKeeper location is stored on the instance for init_zookeeper().
        self._zk_hosts = settings.get('ZOOKEEPER_LOCATION')
        self.init_zookeeper()

    def init_zookeeper(self):
        """Start the Kazoo client and create this worker's ephemeral znode."""
        self._zk = KazooClient(hosts=self._zk_hosts)
        self._zk.add_listener(self.zookeeper_listener)
        self._zk.start()
        self.znode_path = self._zk.create("/frontera/hh-f-worker", ephemeral=True,
                                          sequence=True, makepath=True)

    def zookeeper_listener(self, state):
        """Connection state callback; every branch is currently a placeholder."""
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            pass
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            pass
        else:
            # Handle being connected/reconnected to Zookeeper
            pass

    def set_process_info(self, process_info):
        """Remember ``process_info`` and store it as the data of this worker's znode."""
        self.process_info = process_info
        self._zk.set(self.znode_path, self.process_info)

    def set_job_id(self, job_id):
        """Forward ``job_id`` to the backend and remember it on the worker."""
        self._backend.set_job_id(job_id)
        self.job_id = job_id
def start():
    """Create the module-level Kazoo client, optionally watch the barrier node, and connect.

    Rebinds the global ``zk`` to a ``KazooClient()`` built with default
    arguments. When ``shell.config['barrier']`` is ``True`` a ``DataWatch`` is
    installed on ``/<identity>/barrier``. ``my_listener`` is then registered
    and the client started; any exception raised by ``zk.start()`` is logged
    and the process exits with status 1.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened
    # source; confirm which ``if`` the ``else`` belongs to.
    global zk
    zk = KazooClient()
    if shell.config['barrier'] is True:
        path_barrier = '/'+shell.config['identity']+'/barrier'
        # NOTE(review): value_barrier is not used anywhere in this function.
        value_barrier = json.dumps({'NodeId':shell.config['nodeid']}, encoding='utf-8')
        @zk.DataWatch(path_barrier)
        def watch_node(data, stat, event):
            # Updates the shared two-slot ``flag`` list: slot 1 is set when
            # the callback runs without an event, slot 0 when a DELETED event
            # is seen.
            global flag
            if event:
                logging.info("Node Event %s %s, data %s" %(event.path, event.type, data))
                if event.type == EventType.DELETED:
                    flag[0] = True
                    # Only spawn create_ephemeral once slot 1 has been set.
                    if flag[1]:
                        zk.handler.spawn(create_ephemeral)
            else:
                flag[1] = True
    zk.add_listener(my_listener)
    try:
        zk.start()
    except Exception as e:
        logging.error(e)
        sys.exit(1)
def connect_to_zk():
    """Return a started Kazoo client for the ensemble named in the environment.

    The host string is taken from the ``ZOOKEEPER_CONN_STRING`` environment
    variable. ``state_listener`` is attached once the client has started.
    """
    client = KazooClient(hosts=os.environ.get('ZOOKEEPER_CONN_STRING'))
    client.start()
    client.add_listener(state_listener)
    logging.info("connected to Zookeeper")
    return client
class Exhibitor:
    """ZooKeeper reader whose host list is kept in sync with an Exhibitor provider."""

    def __init__(self, exhibitor, chroot):
        """Build the ensemble provider and start a Kazoo client rooted at ``chroot``.

        ``exhibitor`` must provide the ``'hosts'`` and ``'port'`` keys.
        """
        self.chroot = chroot
        self.exhibitor = ExhibitorEnsembleProvider(
            exhibitor['hosts'], exhibitor['port'], poll_interval=30)
        command_policy = {'deadline': 10, 'max_delay': 1, 'max_tries': -1}
        connection_policy = {'max_delay': 1, 'max_tries': -1}
        self.client = KazooClient(
            hosts=self.exhibitor.zookeeper_hosts + self.chroot,
            command_retry=command_policy,
            connection_retry=connection_policy)
        self.client.add_listener(self.session_listener)
        self.client.start()

    def session_listener(self, state):
        """Connection state callback; intentionally does nothing."""
        pass

    def _poll_exhibitor(self):
        """Push a new host list to the client when the provider's poll() is truthy."""
        if not self.exhibitor.poll():
            return
        self.client.set_hosts(self.exhibitor.zookeeper_hosts + self.chroot)

    def get(self, *params):
        """Refresh hosts if needed, then run ``client.get`` through the retry helper."""
        self._poll_exhibitor()
        return self.client.retry(self.client.get, *params)

    def get_children(self, *params):
        """Like :meth:`get` for ``get_children``; a missing node yields ``[]``."""
        self._poll_exhibitor()
        try:
            children = self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []
        return children
class ZookeeperSession(object):
    """Session that registers one ephemeral, sequential znode under a root path."""

    def __init__(self, locations, name_prefix, root_prefix='/frontera'):
        """Connect to ``locations`` and create ``<root_prefix>/<name_prefix>`` (sequential)."""
        self._zk = KazooClient(hosts=locations)
        self._zk.add_listener(self.zookeeper_listener)
        self._zk.start()
        self.root_prefix = root_prefix
        node_template = "%s/%s" % (self.root_prefix, name_prefix)
        self.znode_path = self._zk.create(node_template, ephemeral=True,
                                          sequence=True, makepath=True)

    def zookeeper_listener(self, state):
        """Connection state callback; every branch is currently a placeholder."""
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            pass
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            pass
        else:
            # Handle being connected/reconnected to Zookeeper
            pass

    def set(self, value):
        """Store ``value`` as the data of this session's znode."""
        self._zk.set(self.znode_path, value)

    def get_workers(self, prefix='', exclude_prefix=''):
        """Yield the data of each child of the root path, filtered by name.

        A child is skipped when ``prefix`` is non-empty and the name does not
        start with it, or when ``exclude_prefix`` is non-empty and the name
        does start with it.
        """
        for child in self._zk.get_children(self.root_prefix):
            mismatched = bool(prefix) and not child.startswith(prefix)
            if mismatched:
                continue
            excluded = bool(exclude_prefix) and child.startswith(exclude_prefix)
            if excluded:
                continue
            payload, _stat = self._zk.get(self.root_prefix + "/" + child)
            yield payload
class ZKStore:
    """Small read/write helper on top of a started Kazoo client."""

    def __init__(self, hosts):
        """Create the client for ``hosts``, attach the module-level ``listener`` and start it."""
        self.zk = KazooClient(hosts=hosts)
        self.zk.add_listener(listener)
        self.zk.start()

    def isConnected(self):
        """Return True when the module-level ``__state__`` equals 1, else False."""
        return True if __state__ == 1 else False

    def write(self, path, node, value):
        """Store ``value`` at ``path/node``: update the node if it exists, create it otherwise."""
        target = path + "/" + node
        self.zk.ensure_path(path)
        if not self.zk.exists(target):
            self.zk.create(target, value)
        else:
            self.zk.set(target, value)

    def read(self, path):
        """Return the data stored at ``path``, or None when the node does not exist."""
        if not self.zk.exists(path):
            return None
        payload, _stat = self.zk.get(path)
        return payload
def start(servers=None, force_reconnect=False):
    """Create the module-level Kazoo client and wait until it reports CONNECTED.

    :param servers: comma separated ``host:port`` list. When empty, hosts are
        read from ``/opt/cluster/zookeeper_addresses.json`` (key ``cluster``,
        port forced to 2181), falling back to ``127.0.0.1:2181``.
    :param force_reconnect: build a new client even if the global ``zk`` is
        already set.

    Uses the module-level ``ztimeout``, ``connection_state`` and
    ``current_state``. If ``current_state`` has not become ``"CONNECTED"``
    after more than 10 one-second waits, the process kills itself.
    """
    global zk
    log.warn("start zookeeper current:%s" % zk)
    if zk and not force_reconnect:
        return

    server_list = []
    if not servers:
        try:
            with open("/opt/cluster/zookeeper_addresses.json") as f:
                info = json.loads(f.read())
                for s in info.get("cluster"):
                    server_list.append("%s:2181" % s.split(":")[0])
        except Exception as exc:
            # Best effort: a missing or malformed file falls back to localhost
            # below, but KeyboardInterrupt/SystemExit are no longer swallowed.
            log.warn("could not read zookeeper addresses: %s" % exc)
        if len(server_list) == 0:
            server_list.append("127.0.0.1:2181")
        servers = str(",".join(server_list))
    else:
        server_list = servers.split(",")

    def pick_server():
        # NOTE(review): this helper is not called anywhere in start().
        # Probes randomly chosen servers with a 3 second TCP connect and
        # returns the first one that answers (None if none does).
        for _ in range(len(server_list)):
            server = random.choice(server_list)
            try:
                host, port = server.split(":")
            except ValueError:
                # No (or more than one) ':' in the entry: use the default port.
                host, port = server, 2181
            try:
                log.warn("checking service %s:%s" % (host, port))
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.settimeout(3)
                s.connect((str(host), int(port)))
                s.close()
                log.warn("checking service %s:%s success" % (host, port))
                return server
            except Exception:
                log.warn("checking service %s:%s failed" % (host, port))
                continue

    log.warn("Zookeeper connect to: %s connect" % (servers))
    logger = logging.getLogger("kazoo")
    logger.setLevel(logging.WARNING)
    zk = KazooClient(hosts=servers, timeout=ztimeout, logger=logger)
    zk.add_listener(connection_state)
    zk.start(timeout=ztimeout)  # zk multiplies this by 1000; it should not.
    log.warn("connected")

    wait_time = 0
    while True:
        log.warn("wait %s" % current_state)
        if current_state == "CONNECTED":
            log.warn("return")
            break
        if wait_time > 10:
            log.warn("wait too much killing myself")
            os.kill(os.getpid(), signal.SIGKILL)
            os._exit(1)
        time.sleep(1)
        wait_time += 1
def achieve_consensus(self):
    """Run the consensus procedure on a fresh ZooKeeper client.

    A client for ``self._hosts`` is started, the method spins until the
    state listener flips ``self._connected``, and the result of
    ``_run_consensus_procedure`` is returned. The client is stopped and
    closed whether or not the procedure raises.
    """
    log.info('Set up ZK client using host(s): %s', self._hosts)
    client = KazooClient(hosts=self._hosts)
    # ``_connected`` is shared between threads: it is reset here and updated
    # by the state change listener, which Kazoo invokes from its own
    # connection management thread, see
    # http://kazoo.readthedocs.org/en/latest/api/handlers/threading.html.
    self._connected = False
    client.add_listener(self._zk_state_change_listener)
    client.start()
    # Poll until the listener has recorded the connection; otherwise there is
    # no guarantee the flag is up to date inside `_run_consensus_procedure`.
    while True:
        if self._connected:
            break
        time.sleep(0.01)
    self._zk = client
    try:
        # May raise ConnectionLost or various kazoo.exceptions.* types.
        outcome = self._run_consensus_procedure()
        return outcome
    finally:
        log.info('Shut down ZK client.')
        try:
            client.stop()
        finally:
            client.close()
def consumer_group(request, cluster_id, group_id):
    """Render the ``consumer_group.mako`` page for one consumer group of a cluster.

    A short-lived Kazoo client is opened against the cluster's
    ``zk_host_ports`` and is always stopped, even when the lookup raises.
    """
    cluster = get_cluster_or_404(id=cluster_id)
    zk = KazooClient(hosts=cluster['zk_host_ports'])
    zk.add_listener(my_listener)
    zk.start()
    try:
        group = _get_consumer_group(zk=zk, cluster=cluster, group_id=group_id)
    finally:
        # Release the connection on every path so a failed lookup does not
        # leave the client running.
        zk.stop()
    return render('consumer_group.mako', request,
                  {'cluster': cluster, 'consumer_group': group})
def connect_to_zookeepr(self):
    """Return a started read-only Kazoo client for ``self.zookeeper_urls``.

    ``self.my_listener`` is attached after the client has been started.
    """
    client = KazooClient(hosts=self.zookeeper_urls, read_only=True)
    client.start()
    client.add_listener(self.my_listener)
    return client
class SentinelDaemon(object):
    """Sentinel process: connects to ZooKeeper, then builds its task client and REST server."""

    def __init__(self, port):
        """
        Read config and spawn child processes.

        Blocks, polling once per second, until ``self._settings`` has been
        populated (the ZooKeeper listener is expected to do that).

        :type port: int
        """
        self._log = logging.getLogger('sent.daemon')
        self._log.info('Creating Sentinel')
        self._port = port
        self.children = dict()
        self._settings = None
        self._system = get_system()
        self._hostname = socket.getfqdn()
        self._prev_state = None
        self.listener_lock = Lock()
        self.version = get_version()
        self.task_client = None
        self.zkclient = KazooClient(hosts=get_zk_conn_string(),
                                    timeout=60.0,
                                    handler=SequentialThreadingHandler(),
                                    logger=logging.getLogger('kazoo.daemon'))
        self.zkclient.add_listener(self._zk_listener)
        # this will run self._reset_after_connection_loss
        self.zkclient.start()
        while not self._settings:
            self._log.info('Waiting for settings.')
            time.sleep(1)

        # Read the 'zookeeper' section once, with an empty default, so a
        # missing section does not raise AttributeError on None.
        zk_settings = self._settings.get('zookeeper', {})
        self._tmp_dir = os.path.join(zk_settings.get('temp_directory', '/'), 'ruok')
        self.task_client = ZKTaskClient(self.children, self.zkclient,
                                        zk_settings.get('task'))
        self._rest_server = tornado.httpserver.HTTPServer(
            RestServer(self.children, self.version, self._tmp_dir,
                       self._hostname, self.zkclient))
        signal.signal(signal.SIGINT, self._handle_sigint)
        signal.signal(signal.SIGTERM, self._handle_sigint)
        self._log.info('Created Sentinel')

    def __enter__(self):
        """Log startup and, on Linux, try to lift the core file size limit."""
        logging.info('Starting Sentinel, listening on port {}'.format(self._port))
        if platform.system() == 'Linux':
            # ``resource`` is imported lazily, only on the Linux path.
            import resource
            try:
                resource.setrlimit(resource.RLIMIT_CORE,
                                   (resource.RLIM_INFINITY, resource.RLIM_INFINITY))
                logging.info('Set RLIMIT_CORE to unlimited. Core files can be generated by sentinel on this system.')
            except ValueError as ve:
                logging.info('Invalid resource limit specified. Core files will not be generated for apps: {0}'.format(ve))
            except AttributeError as ae:
                logging.info('AttributeError experienced: {0}'.format(ae))
class dzk:
    """Holds a Kazoo client for the given hosts."""

    def __init__(self, hosts, secs):
        """Create the client for ``hosts`` and try to start it.

        ``secs`` is accepted but not used here. Any exception raised while
        starting the client or attaching ``self.listener`` is reduced to a
        printed message.
        """
        self.hosts = hosts
        self.zk = KazooClient(hosts=self.hosts)
        try:
            self.zk.start()
            self.zk.add_listener(self.listener)
        except Exception:
            print("ERROR connect LOST ==============>")
def _get_consumers(cluster):
    """Return the consumer group details for every group of ``cluster``.

    Opens a short-lived Kazoo client on ``cluster['zk_host_ports']``; the
    client is stopped even if one of the lookups raises.
    """
    zk = KazooClient(hosts=cluster['zk_host_ports'])
    zk.add_listener(my_listener)
    zk.start()
    try:
        groups = _get_consumer_groups(zk, cluster['id'])
        consumer_groups = [
            _get_consumer_group(zk=zk, cluster=cluster, group_id=group)
            for group in groups
        ]
    finally:
        # Always release the connection, including on failure.
        zk.stop()
    return consumer_groups
class dzk:
    """Registers this host under ``/my/<ip>`` in ZooKeeper and reads its data back."""

    def __init__(self):
        self.BasePath = "/my/"
        self.zk = KazooClient(hosts='x.24.79.51:2181,x.24.79.53:2181', retry_max_delay=2000)
        self.zk.start()
        self.zk.add_listener(self.listener)

    def listener(self, state):
        """Connection state callback registered with the client.

        Takes ``self`` so it can be registered as the bound method
        ``self.listener``.
        """
        if state == KazooState.LOST:
            # Restart through the handler so the blocking start() call does
            # not run inside the callback itself.
            self.zk.handler.spawn(self.zk.start)
        elif state == KazooState.SUSPENDED:
            print("*******listener saw KazooState.SUSPENDED")
        else:
            print("*******listener saw KazooState.CONNECT")

    def getIpHost(self):
        """Resolve and return this host's IP; the FQDN is kept in ``self.myname``."""
        self.myname = socket.getfqdn(socket.gethostname())
        myip = socket.gethostbyname(self.myname)
        return myip

    def register(self):
        """Ensure the znode ``BasePath + ip`` exists; exit with status 2 without an IP."""
        ip = self.getIpHost()
        if not ip:
            # No node path can be built without an address, so report the
            # host name that failed to resolve.
            print("[ERROR:] %s does not exist " % (self.myname))
            sys.exit(2)
        node = self.BasePath + ip
        print("register: %s" % node)
        if not self.zk.exists(node):
            self.zk.ensure_path(node)

    def getData(self):
        """Print version and data of this host's znode when it exists."""
        ip = self.getIpHost()
        if not ip:
            print("[ERROR:] %s does not exist " % (self.myname))
            return
        node = self.BasePath + ip
        if self.zk.exists(node):
            data, stat = self.zk.get(node)
            print("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))

    def monitor(self):
        """Placeholder."""
        pass

    def heartbeat(self):
        """Placeholder."""
        pass

    def role(self):
        """Placeholder."""
        pass

    def command(self):
        """Placeholder."""
        pass
def get_zk(zkhosts, timeout, command_retry=None, connection_retry=None):
    '''
    Build a Kazoo client, attach the module-level ``listener`` and start it.

    Returns the started client. A ``KazooTimeoutError`` during start is
    logged and ends the process with exit status 1.
    '''
    options = {
        'hosts': zkhosts,
        'timeout': timeout,
        'command_retry': command_retry,
        'connection_retry': connection_retry,
    }
    conn = KazooClient(**options)
    conn.add_listener(listener)
    try:
        conn.start()
    except KazooTimeoutError as exc:
        log.error(exc)
        sys.exit(1)
    else:
        return conn
def main():
    """Leader election demo against a local ZooKeeper.

    Creates nine ephemeral sequential nodes under ``/ELECTION``, picks one at
    random as "ours" and looks for the node sorted directly before it. With
    no predecessor we are the leader and return; otherwise a data watch is
    put on the predecessor and triggered with a ``set``. The client is
    stopped on every exit path.
    """
    zk = KazooClient(hosts="127.0.0.1:2181", timeout=2.0)
    zk.add_listener(my_listener)
    zk.start()
    try:
        if zk.exists("/ELECTION") is None:
            zk.ensure_path("/ELECTION")

        node_pathes = [
            zk.create("/ELECTION/guid-n_", b"a value", ephemeral=True, sequence=True)
            for _ in range(9)
        ]
        my_path = random.choice(node_pathes).replace("/ELECTION/", "")

        children = zk.get_children("/ELECTION/", watch=election_child_watcher)
        # Walk the sorted children up to our own node; the last one seen
        # before it is the node to watch.
        prev_path = None
        for child_path in sorted(children):
            if child_path == my_path:
                break
            prev_path = child_path

        # I'm the leader
        if prev_path is None:
            print("OK I'm leader don't have to watch")
            return

        # fires twice, once on creation ignore
        @zk.DataWatch("/ELECTION/" + prev_path)
        def watch_node(data, stat):
            # only watch for first change; ``stat`` is checked for None so a
            # callback without node metadata does not raise.
            if stat is not None and stat.version == 1:
                print("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))

        print("setting watch on " + prev_path)
        print("my %s" % my_path)
        zk.set("/ELECTION/" + prev_path, b"some data")
        print("boom. watch triggered?")
        print("bye")
    finally:
        # Covers the leader early return and any error above.
        zk.stop()
def connect(self):
    """Open and start a Kazoo client configured from the application settings.

    Hosts, session timeout and retry policies come from the ``KAZOO_*``
    config keys. When both ``KAZOO_ACL_USERNAME`` and ``KAZOO_ACL_PASSWORD``
    are set, a digest ACL built from the ``KAZOO_ACL_*`` flags becomes the
    default ACL and matching digest credentials are sent. The connection
    state listener is attached after the client has started.

    :return: Kazoo client object as connection.
    """
    options = {
        'hosts': self.app.config['KAZOO_HOSTS'],
        'timeout': self.app.config['KAZOO_SESSION_TIMEOUT'],
        'connection_retry': self.app.config['KAZOO_RETRY'],
        'command_retry': self.app.config['KAZOO_RETRY'],
    }

    username = self.app.config.get('KAZOO_ACL_USERNAME', None)
    password = self.app.config.get('KAZOO_ACL_PASSWORD', None)
    if username and password:
        # Every permission flag defaults to False unless configured.
        digest_acl = make_digest_acl(
            username=username,
            password=password,
            read=self.app.config.get('KAZOO_ACL_READ', False),
            write=self.app.config.get('KAZOO_ACL_WRITE', False),
            create=self.app.config.get('KAZOO_ACL_CREATE', False),
            delete=self.app.config.get('KAZOO_ACL_DELETE', False),
            admin=self.app.config.get('KAZOO_ACL_ADMIN', False),
            all=self.app.config.get('KAZOO_ACL_ALL', False),
        )
        options['default_acl'] = [digest_acl]
        options['auth_data'] = [("digest", ":".join((username, password)))]

    client = KazooClient(**options)
    client.start(timeout=self.app.config['KAZOO_START_TIMEOUT'])
    client.add_listener(self.connection_state_listener)
    return client
class ZookeeperSession(BaseClient):
    """Client session wrapper around a gevent-driven Kazoo client."""

    conext_manager = ZookeeperResponseContextManager
    # Extra KazooClient keyword arguments applied per session policy.
    loose_policy = {}
    strict_policy = {}

    def __init__(self, server_list='127.0.0.1:2181', *args, **kwargs):
        super(ZookeeperSession, self).__init__(*args, **kwargs)
        self.session_policy = "loose_policy"
        self._zookeeper_client = None
        self.server_list = server_list

    def set_session_policy(self, session_policy="loose"):
        '''prototype not currently used. '''
        self.session_policy = session_policy + "_policy"

    def connect(self, *args, **kwargs):
        '''See http://kazoo.readthedocs.org/en/latest/api/client.html for details
        regarding available options. Keyword arguments given here override the
        defaults and the active session policy.

        Waits up to 30 seconds for the asynchronous start and raises
        ``ResponseError`` when the client is still not connected.
        '''
        defaults = {
            "hosts": self.server_list,
            "handler": SequentialGeventHandler()
        }
        defaults.update(getattr(self, self.session_policy))
        defaults.update(kwargs)
        self._state = KazooState.LOST
        self._zookeeper_client = KazooClient(**defaults)
        self._zookeeper_client.add_listener(self._state_tracker)
        watchable = self._zookeeper_client.start_async()
        watchable.wait(30)
        if not self._zookeeper_client.connected:
            # ``defaults`` has no "server_list" key, so the mapping is built
            # explicitly; formatting from ``defaults`` raised KeyError.
            err = "Could not connect to Zookeeper server(s) %(server_list)s" % {
                "server_list": defaults["hosts"]}
            raise ResponseError(err)

    @require_state(KazooState.CONNECTED)
    @record_stats
    def ensure_path(self, path, watcher=None):
        # NOTE(review): ``watcher`` is forwarded as the second positional
        # argument of the client's ensure_path - confirm that is intended.
        self._zookeeper_client.ensure_path(path, watcher)

    def _state_tracker(self, state):
        """Record the most recent connection state reported by Kazoo."""
        self._state = state

    def __del__(self):
        # The attribute is missing if __init__ failed before assigning it.
        client = getattr(self, "_zookeeper_client", None)
        if isinstance(client, KazooClient):
            client.stop()
class NodeMonitor:
    """Publishes collected machine info to ``/monitorData/<node id>`` every five seconds."""

    # Next id to hand out; each new instance takes the current value.
    STATIC_NODE_ID = 0

    def __init__(self):
        self.zk = None
        self.SERVER_IP_AND_PORT = "localhost:2181"
        self.NODE_ID = str(NodeMonitor.STATIC_NODE_ID)
        NodeMonitor.STATIC_NODE_ID += 1

    def start_zk(self):
        """Create and start the client and make sure this node's path exists."""
        self.zk = KazooClient(hosts=self.SERVER_IP_AND_PORT)
        self.zk.add_listener(self._connection_listener)
        self.zk.start()
        self.zk.ensure_path("/monitorData/" + self.NODE_ID)

    def start_update_info(self):
        """Kick off the periodic update loop on a timer thread."""
        t = threading.Timer(0.0, self._update_info)
        t.start()

    def _update_info_once(self):
        """Send one asynchronous update with the freshly collected info."""
        cmi = CollectMachineInfo()
        async_obj = self.zk.set_async("/monitorData/" + self.NODE_ID,
                                      (cmi.collectInfo()).encode(encoding="utf-8"))
        async_obj.rawlink(self._update_info_callback)

    def _connection_listener(self, state):
        """Connection state callback registered on the client."""
        if state == KazooState.LOST:
            print("connection lost, going to connect again")
            # NOTE(review): this builds a second client from inside the
            # listener without stopping the previous one - confirm intended.
            self.start_zk()
        elif state == KazooState.SUSPENDED:
            print("suspended")
        else:
            print("connected ok")

    def _update_info_callback(self, async_obj):
        """Report the outcome of an asynchronous update."""
        try:
            # Fetch the result so a failed update raises here; without this
            # every update was reported as a success.
            async_obj.get()
            print("update success")
        except (ConnectionLossException, NoAuthException):
            print("exception!")

    def _update_info(self):
        """Send one update and re-arm the five second timer."""
        print("begin to update")
        self._update_info_once()
        t = threading.Timer(5.0, self._update_info)
        t.start()
def __setupMetaServerConnection(self):
    """Build (without starting) the Kazoo client for the configured meta server.

    The ``meta`` option of the ``server`` config section must be a
    ``zk://host[:port][,...]/root`` URL. The path, stripped of trailing
    slashes, is stored in ``self.rootpath``.

    :raises ValueError: wrong scheme, or empty root path.
    """
    meta_url = urlparse(config.get('server', 'meta'))
    if meta_url.scheme != 'zk':
        raise ValueError("Meta URL must start with zk://")
    if meta_url.path in ('/', ''):
        raise ValueError("Service root path not found.")
    self.rootpath = meta_url.path.rstrip('/')

    # NOTE: currently, auth_data is not supported, so anything up to the
    # last '@' in the network location is dropped.
    host_list = meta_url.netloc.rpartition('@')[2]
    client = KazooClient(hosts=host_list, handler=SequentialGeventHandler())
    client.add_listener(self.__connection)
    return client
def run():
    """Generate partition reassignment JSON and optionally write it to ZooKeeper.

    Waits for Kafka (or sleeps 10 seconds when ``WAIT_FOR_KAFKA`` is ``no``),
    connects to the ensemble in ``ZOOKEEPER_CONN_STRING`` and first generates
    a plan for broken topics. If that plan is empty, a second plan with
    ``broken_topics=False`` is generated. Plans are written unless
    ``WRITE_TO_JSON`` is ``no``. The client is stopped on every exit path.
    """
    replication_factor = 3
    zookeeper_connect_string = os.getenv('ZOOKEEPER_CONN_STRING')
    logging.info("waiting for kafka to start up")
    if os.getenv('WAIT_FOR_KAFKA') != 'no':
        wait_for_kafka_startup.run(get_own_ip())
    else:
        sleep(10)
    logging.info("kafka port is open, continuing")

    zk = KazooClient(hosts=zookeeper_connect_string)
    zk.start()
    try:
        zk.add_listener(state_listener)
        logging.info("connected to Zookeeper")

        zk_dict = get_zk_dict(zk)
        result = generate_json(zk_dict, replication_factor, broken_topics=True)
        if result != {}:
            logging.info("JSON generated")
            logging.info("there are " + str(len(result['partitions'])) + " partitions to repair")
            logging.debug(result)
            if os.getenv('WRITE_TO_JSON') != 'no':
                write_json_to_zk(zk, result)
        else:
            logging.info("no JSON generated")
            # NOTE(review): ``needed`` starts as True and is only ever set to
            # True, so the "no unused Broker found" branch cannot run as
            # written. Left unchanged; confirm the intended initial value.
            needed = True
            for broker in zk_dict['broker']:
                if int(get_broker_weight(zk_dict, {'partitions': []}, broker)) == 0:
                    needed = True
            if needed is True:
                result = generate_json(zk_dict, replication_factor, broken_topics=False)
                if result != {}:
                    logging.info("JSON generated")
                    if os.getenv('WRITE_TO_JSON') != 'no':
                        write_json_to_zk(zk, result)
            else:
                logging.info("no unused Broker found")
    finally:
        # Release the connection even when one of the steps above raises.
        zk.stop()
    logging.info("exiting")
def _get_cluster_topology(cluster):
    """Return brokers, consumer groups and per-group consumer counts for ``cluster``.

    The result is a dict with the keys ``cluster``, ``brokers``,
    ``consumer_groups`` and ``consumer_groups_status``. A group's status is
    the number of children under ``<consumers_path>/<group>/ids``, or 0 when
    that node does not exist. The Kazoo client is stopped even if a lookup
    raises.
    """
    zk = KazooClient(hosts=cluster['zk_host_ports'])
    zk.add_listener(my_listener)
    zk.start()
    try:
        brokers = _get_brokers(zk, cluster['id'])
        consumer_groups = _get_consumer_groups(zk, cluster['id'])
        consumer_groups_status = {}  # 0 = offline, (not 0) = online
        for consumer_group in consumer_groups:
            consumers_path = cluster['consumers_path'] + "/" + consumer_group + "/ids"
            try:
                consumers = zk.get_children(consumers_path)
            except NoNodeError:
                consumer_groups_status[consumer_group] = 0  # 0 = offline
            else:
                consumer_groups_status[consumer_group] = len(consumers)  # (not 0) = online
    finally:
        # Always release the connection, including on failure.
        zk.stop()
    return {'cluster': cluster,
            'brokers': brokers,
            'consumer_groups': consumer_groups,
            'consumer_groups_status': consumer_groups_status}
class ZkServiceRegister:
    """Registers service endpoints as ephemeral znodes."""

    def __init__(self, zk_address, zk_timeout):
        """Create the client, attach a ``ZkServiceRegisterListener`` and start it."""
        self.__zkClient = KazooClient(hosts=zk_address, timeout=zk_timeout, read_only=False)
        self.__zkListener = ZkServiceRegisterListener(self.__zkClient)
        self.__zkClient.add_listener(self.__zkListener)
        self.__zkClient.start()

    def register(self, path, host, port, weight=DEFAULT_HOST_WEIGHT):
        """Create the ephemeral node ``<path>/<host>:<port>:<weight>``.

        A failure while making sure ``path`` exists is only printed. An
        already existing registration node is deleted before it is recreated.
        """
        try:
            parent_missing = not self.__zkClient.exists(path)
            if parent_missing:
                self.__zkClient.ensure_path(path)
        except Exception as e:
            print(e.message)

        reg_path = "".join([path, '/', host, ':', str(port), ':', str(weight)])
        if self.__zkClient.exists(reg_path):
            self.__zkClient.delete(reg_path)
        self.__zkClient.create(reg_path, value='', ephemeral=True)
class ConfigFlags(StorageBase):
    """ A configuration manager using ZooKeeper. For setting flags
    on all instances for a given product. This will default to
    using a locally stored cache if ZooKeeper fails to respond.

    Note: current live flags are limited to 1MB total.

    Set initial values in config.ini file as
    flags.foo = bar
    to set "foo" flag to value bar.

    """
    #TODO:
    # * Break flags into separate elements instead of single JSON?
    # * Add fake dict to hold settings if ZK not installed/available.

    localFlags = {}
    version = None

    def __init__(self, config, **kw):
        """Connect to ZooKeeper and load the flags.

        ``config`` is either a settings mapping or an object whose type name
        contains ``Configurator`` (its ``get_settings()`` is used). The
        optional ``flags.zk.settings`` entry is a JSON object of KazooClient
        keyword arguments. Any failure is reported with ``warnings.warn``
        instead of being raised.
        """
        try:
            if 'Configurator' in type(config).__name__:
                config = config.get_settings()
            conf = config.get('flags.zk.settings')
            if conf is not None:
                conf = dict(json.loads(conf))
                # Expand the settings as keyword arguments; passed
                # positionally the whole dict would land in ``hosts``.
                self.zk = KazooClient(**conf)
            else:
                self.zk = KazooClient()
            # get a copy of the local flags.
            self.zk_path = config.get('flags.zk.path', '/general/config')
            self.zk.start()
            node = self.zk.exists(self.zk_path)
            if node is None:
                # Virgin install, set from the config values.
                self._init_zk(config)
            self.zk.add_listener(self._zk_listener)
            self._refreshCache(config=config)
        except Exception as e:
            warnings.warn("Could not connect to ZooKeeper %s" % repr(e))
def init_app(self, app):
    """
    Apply default kazoo settings to the app configuration, then create the
    kazoo client and store it under ``app.extensions['kazoo']['client']``.

    The client is started synchronously (bounded by ``KAZOO_START_TIMEOUT``)
    when ``KAZOO_START_BLOCKING`` is set, asynchronously otherwise.

    :param app: Flask application instance.
    """
    cfg = app.config
    cfg.setdefault('KAZOO_HOSTS', '127.0.0.1:2181')
    cfg.setdefault('KAZOO_START_TIMEOUT', 3)
    cfg.setdefault('KAZOO_START_BLOCKING', False)
    cfg.setdefault('KAZOO_SESSION_TIMEOUT', 10.0)  # kazoo default
    cfg.setdefault('KAZOO_DEFAULT_RETRY', True)
    cfg.setdefault('KAZOO_RETRY_MAX_DELAY_SECONDS', 60 * 60)  # kazoo default of 1hr.

    # Make sure there is a slot for kazoo in the application extensions.
    app.extensions.setdefault('kazoo', {})

    # One retry policy is shared by connection and command retries; None
    # when the default retry behaviour is switched off.
    retry_kwargs = (
        {'max_delay': cfg['KAZOO_RETRY_MAX_DELAY_SECONDS']}
        if cfg['KAZOO_DEFAULT_RETRY'] else None
    )

    kazoo_client = KazooClient(hosts=cfg['KAZOO_HOSTS'],
                               timeout=cfg['KAZOO_SESSION_TIMEOUT'],
                               connection_retry=retry_kwargs,
                               command_retry=retry_kwargs)

    if not cfg['KAZOO_START_BLOCKING']:
        kazoo_client.start_async()
    else:
        kazoo_client.start(cfg['KAZOO_START_TIMEOUT'])
    kazoo_client.add_listener(self.connection_state_listener)

    app.extensions['kazoo']['client'] = kazoo_client
class TestSessions(unittest.TestCase):
    """Session behaviour against a single-node test cluster (both tests are skipped)."""

    def setUp(self):
        """Start a one-node cluster, connect a client and create a scratch node."""
        from kazoo.client import KazooClient
        from kazoo.protocol.states import KazooState
        from kazoo.testing.common import ZookeeperCluster

        zk_home = os.environ.get("ZOOKEEPER_PATH")
        zk_classpath = os.environ.get("ZOOKEEPER_CLASSPATH")
        self.cluster = ZookeeperCluster(zk_home, size=1, port_offset=21000,
                                        classpath=zk_classpath)
        self.cluster.start()
        # Terminate the cluster at interpreter exit as well.
        atexit.register(lambda cluster: self.cluster.terminate(), self.cluster)

        self.client = KazooClient(self.cluster[0].address, max_retries=5)
        self.ev = threading.Event()

        def back(state):
            # Signal the test once the client reports CONNECTED.
            if state == KazooState.CONNECTED:
                self.ev.set()

        self.client.start()
        self.path = self.client.create(uuid.uuid4().hex)
        self.client.add_listener(back)

    def test_restarted_server(self):
        raise SkipTest('Patch missing')
        self.cluster.stop()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertTrue(self.client.retry(self.client.exists, self.path))

    def test_terminated_server(self):
        raise SkipTest('Patch missing')
        self.cluster.reset()
        self.cluster.start()
        self.ev.wait(5)
        eq_(self.ev.is_set(), True)
        self.assertFalse(self.client.retry(self.client.exists, self.path))

    def tearDown(self):
        """Reset the event and stop the client, then the cluster."""
        self.ev.clear()
        self.client.stop()
        self.cluster.stop()
class ZkServiceProvider:
    """Watches service paths in ZooKeeper and pushes host lists to a connection object."""

    def __init__(self, zk_address, zk_timeout, connection):
        """Start a read-only client and attach a ``ZkServiceProviderListener``.

        ``connection`` must offer ``update_service(service, hosts)``.
        """
        self.__service_dict = {}
        self.__zk_address = zk_address
        self.__zkClient = KazooClient(hosts=zk_address, timeout=zk_timeout, read_only=True)
        self.__zkClient.start()
        self.__zkListener = ZkServiceProviderListener(self.__zkClient)
        self.__zkClient.add_listener(self.__zkListener)
        self.__connection = connection

    def register_service(self, service, zk_path, client_cls):
        """Remember the service and start watching ``zk_path``; returns the watcher result."""
        self.__service_dict[service] = (zk_path, client_cls)
        result = self._register_watcher(service, zk_path, client_cls)
        return result

    def _register_watcher(self, service, zk_path, client_cls):
        """Install a children watch on ``zk_path`` and publish the current hosts.

        Returns False when the path does not exist or its children cannot be
        read, True once the host list has been pushed to the connection.
        """
        @self.__zkClient.ChildrenWatch(zk_path)
        def child_changed(data):
            print('+++++++++++++++' + service + ' child changed.++++++++++++++++++')
            print(data)
            hosts = data
            self.__connection.update_service(service, hosts)

        isExists = self.__zkClient.exists(zk_path)
        if not isExists:
            return False
        try:
            hosts = self.__zkClient.get_children(zk_path)
        except NoNodeError:
            print('no node for the path of ' + zk_path)
            return False
        except Exception:
            # Still best effort, but KeyboardInterrupt/SystemExit are no
            # longer swallowed as a bare ``except:`` would.
            print('other exceptions.')
            return False
        self.__connection.update_service(service, hosts)
        return True

    def stop(self):
        """Stop the underlying client."""
        self.__zkClient.stop()
def init():
    """Connect to the local ZooKeeper and make sure the base paths exist.

    Registers ``stop_zk`` with the client through ``register`` and creates
    ``/jobs``, ``/watchers``, ``/watchlocks`` and ``/executors`` when missing.
    On any error the problem is printed, the client (if created) is stopped
    and the process exits with status 1. On success the global ``inited`` is
    set and the client returned.
    """
    global inited
    zk = None
    try:
        zk = KazooClient(hosts='127.0.0.1:2181')
        zk.add_listener(state_listener)
        zk.start()
        register(stop_zk, zk)
        for base_path in ('/jobs', '/watchers', '/watchlocks', '/executors'):
            create_path_if_not_exists(zk, base_path)
    except Exception as e:
        print('Zk problem  %s' % (e,))
        if zk is not None:
            zk.stop()
        sys.exit(1)
    inited = True
    return zk
class ClusterMonitor(threading.Thread):
    """Periodically checks the cluster members.

    This class is delegated to change state between ACT clustered and ACT
    declustered.
    """

    def __init__(self, zha):
        threading.Thread.__init__(self)
        self.zha = zha
        self.should_run = True
        hosts = self.zha.config.get("connection_string", "127.0.0.1:2181")
        self.zk = KazooClient(hosts=hosts, logger=logger)
        self.zk.add_listener(self._zk_listener)
        self.zk.start()
        self.zroot = self.zha.config.get("cluster_znode", "/zha-state")
        self.znode = self.zroot + "/" + self.zha.config.get("id")
        self._zk_register(first=True)
        # Timestamp of the last check that saw another healthy standby.
        self.not_alone = None

    def run(self):
        """Loop until ``should_run`` is cleared, then decluster and remove our znode."""
        while self.should_run:
            time.sleep(self.zha.config.get("clustercheck_interval", 3))
            self.zha.recheck()
            self._zk_register()
            self.check_cluster()
            self.trigger()
        if self.zha.is_clustered:
            self.zha.config.become_declustered()
            self.zha.is_clustered = False
        self.zk.delete(self.znode)
        logger.info("cluster monitor thread stopped.")

    def check_cluster(self):
        """Count other members reporting ``SBY:HEALTHY``; refresh ``not_alone`` if any.

        Any exception is logged and the check is simply retried next round.
        """
        try:
            healthy = 0
            for member in self.zk.get_children(self.zroot):
                payload, _stat = self.zk.get(self.zroot + "/" + member)
                if payload.strip() != "SBY:HEALTHY":
                    continue
                if member == self.zha.config.get("id"):
                    continue
                healthy += 1
            if healthy != 0:
                self.not_alone = time.time()
            logger.debug("healthy sbys: %d" % (healthy,))
        except Exception as e:
            logger.warn("check cluster failed. Try next time.%s" % e)
def _get_topology():
    """Return one topology dict per configured cluster.

    Each entry has the keys ``cluster``, ``brokers``, ``consumer_groups`` and
    ``consumer_groups_status``. A group's status is the number of children
    under ``<CONSUMERS_PATH>/<group>/ids``, or 0 when that node does not
    exist. A separate Kazoo client is opened per cluster and is stopped even
    when a lookup raises.
    """
    topology = CLUSTERS.get()
    clusters = []
    for cluster in topology:
        zk = KazooClient(hosts=CLUSTERS[cluster].ZK_HOST_PORTS.get())
        zk.add_listener(my_listener)
        zk.start()
        try:
            brokers = _get_brokers(zk, cluster)
            consumer_groups = _get_consumer_groups(zk, cluster)
            consumer_groups_status = {}  # 0 = offline, (not 0) = online
            for consumer_group in consumer_groups:
                consumers_path = CLUSTERS[cluster].CONSUMERS_PATH.get() + "/" + consumer_group + "/ids"
                try:
                    consumers = zk.get_children(consumers_path)
                except NoNodeError:
                    consumer_groups_status[consumer_group] = 0  # 0 = offline
                else:
                    consumer_groups_status[consumer_group] = len(consumers)  # (not 0) = online
            c = {'cluster': get_cluster_or_404(id=cluster),
                 'brokers': brokers,
                 'consumer_groups': consumer_groups,
                 'consumer_groups_status': consumer_groups_status}
            clusters.append(c)
        finally:
            # One client per cluster: release it before moving on, including
            # on failure.
            zk.stop()
    return clusters
class ZkStateManager(StateManager): """ State manager which connects to zookeeper and gets and sets states from there. """ def __init__(self, name, host, port, rootpath, tunnelhost): self.name = name self.host = host self.port = port self.tunnelhost = tunnelhost self.rootpath = rootpath def start(self): if self.is_host_port_reachable(): self.client = KazooClient(self.hostport) else: localport = self.establish_ssh_tunnel() self.client = KazooClient("localhost:" + str(localport)) self.client.start() def on_connection_change(state): LOG.info("Connection state changed to: " + state) self.client.add_listener(on_connection_change) def stop(self): self.client.stop() self.terminate_ssh_tunnel() def get_topologies(self, callback=None): isWatching = False # Temp dict used to return result # if callback is not provided. ret = { "result": None } if callback: isWatching = True else: # Custom callback to get the topologies # right now. def callback(data): ret["result"] = data self._get_topologies_with_watch(callback, isWatching) # The topologies are now populated with the data. return ret["result"] def _get_topologies_with_watch(self, callback, isWatching): """ Helper function to get topologies with a callback. The future watch is placed only if isWatching is True. """ path = self.get_topologies_path() if isWatching: LOG.info("Adding children watch for path: " + path) @self.client.ChildrenWatch(path) def watch_topologies(topologies): callback(topologies) # Returning False will result in no future watches # being triggered. If isWatching is True, then # the future watches will be triggered. return isWatching def get_topology(self, topologyName, callback=None): isWatching = False # Temp dict used to return result # if callback is not provided. ret = { "result": None } if callback: isWatching = True else: # Custom callback to get the topologies # right now. 
def callback(data): ret["result"] = data self._get_topology_with_watch(topologyName, callback, isWatching) # The topologies are now populated with the data. return ret["result"] def _get_topology_with_watch(self, topologyName, callback, isWatching): """ Helper function to get pplan with a callback. The future watch is placed only if isWatching is True. """ path = self.get_topology_path(topologyName) if isWatching: LOG.info("Adding data watch for path: " + path) @self.client.DataWatch(path) def watch_topology(data, stats): if data: topology = Topology() topology.ParseFromString(data) callback(topology) else: callback(None) # Returning False will result in no future watches # being triggered. If isWatching is True, then # the future watches will be triggered. return isWatching def create_topology(self, topologyName, topology): if not topology or not topology.IsInitialized(): raise StateException("Topology protobuf not init properly", StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2] path = self.get_topology_path(topologyName) LOG.info("Adding topology: {0} to path: {1}".format( topologyName, path)) topologyString = topology.SerializeToString() try: self.client.create(path, value=topologyString, makepath=True) return True except NoNodeError as e: raise StateException("NoNodeError while creating topology", StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2] except NodeExistsError as e: raise StateException("NodeExistsError while creating topology", StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2] except ZookeeperError as e: raise StateException("Zookeeper while creating topology", StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2] except Exception as e: # Just re raise the exception. 
raise def delete_topology(self, topologyName): path = self.get_topology_path(topologyName) LOG.info("Removing topology: {0} from path: {1}".format( topologyName, path)) try: self.client.delete(path) return True except NoNodeError as e: raise StateException("NoNodeError while deteling topology", StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2] except NotEmptyError as e: raise StateException("NotEmptyError while deleting topology", StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2] except ZookeeperError as e: raise StateException("Zookeeper while deleting topology", StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2] except Exception as e: # Just re raise the exception. raise def get_pplan(self, topologyName, callback=None): isWatching = False # Temp dict used to return result # if callback is not provided. ret = { "result": None } if callback: isWatching = True else: # Custom callback to get the topologies # right now. def callback(data): ret["result"] = data self._get_pplan_with_watch(topologyName, callback, isWatching) # The topologies are now populated with the data. return ret["result"] def _get_pplan_with_watch(self, topologyName, callback, isWatching): """ Helper function to get pplan with a callback. The future watch is placed only if isWatching is True. """ path = self.get_pplan_path(topologyName) if isWatching: LOG.info("Adding data watch for path: " + path) @self.client.DataWatch(path) def watch_pplan(data, stats): if data: pplan = PhysicalPlan() pplan.ParseFromString(data) callback(pplan) else: callback(None) # Returning False will result in no future watches # being triggered. If isWatching is True, then # the future watches will be triggered. 
return isWatching def create_pplan(self, topologyName, pplan): if not pplan or not pplan.IsInitialized(): raise StateException("Physical Plan protobuf not init properly", StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2] path = self.get_pplan_path(topologyName) LOG.info("Adding topology: {0} to path: {1}".format( topologyName, path)) pplanString = pplan.SerializeToString() try: self.client.create(path, value=pplanString, makepath=True) return True except NoNodeError as e: raise StateException("NoNodeError while creating pplan", StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2] except NodeExistsError as e: raise StateException("NodeExistsError while creating pplan", StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2] except ZookeeperError as e: raise StateException("Zookeeper while creating pplan", StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2] except Exception as e: # Just re raise the exception. raise def delete_pplan(self, topologyName): path = self.get_pplan_path(topologyName) LOG.info("Removing topology: {0} from path: {1}".format( topologyName, path)) try: self.client.delete(path) return True except NoNodeError as e: raise StateException("NoNodeError while deleting pplan", StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2] except NotEmptyError as e: raise StateException("NotEmptyError while deleting pplan", StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2] except ZookeeperError as e: raise StateException("Zookeeper while deleting pplan", StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2] except Exception as e: # Just re raise the exception. raise def get_execution_state(self, topologyName, callback=None): isWatching = False # Temp dict used to return result # if callback is not provided. ret = { "result": None } if callback: isWatching = True else: # Custom callback to get the topologies # right now. 
def callback(data): ret["result"] = data self._get_execution_state_with_watch(topologyName, callback, isWatching) # The topologies are now populated with the data. return ret["result"] def _get_execution_state_with_watch(self, topologyName, callback, isWatching): """ Helper function to get execution state with a callback. The future watch is placed only if isWatching is True. """ path = self.get_execution_state_path(topologyName) if isWatching: LOG.info("Adding data watch for path: " + path) @self.client.DataWatch(path) def watch_execution_state(data, stats): if data: executionState = ExecutionState() executionState.ParseFromString(data) callback(executionState) else: callback(None) # Returning False will result in no future watches # being triggered. If isWatching is True, then # the future watches will be triggered. return isWatching def create_execution_state(self, topologyName, executionState): if not executionState or not executionState.IsInitialized(): raise StateException("Execution State protobuf not init properly", StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2] path = self.get_execution_state_path(topologyName) LOG.info("Adding topology: {0} to path: {1}".format( topologyName, path)) executionStateString = executionState.SerializeToString() try: self.client.create(path, value=executionStateString, makepath=True) return True except NoNodeError as e: raise StateException("NoNodeError while creating execution state", StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2] except NodeExistsError as e: raise StateException("NodeExistsError while creating execution state", StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2] except ZookeeperError as e: raise StateException("Zookeeper while creating execution state", StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2] except Exception as e: # Just re raise the exception. 
raise def delete_execution_state(self, topologyName): path = self.get_execution_state_path(topologyName) LOG.info("Removing topology: {0} from path: {1}".format( topologyName, path)) try: self.client.delete(path) return True except NoNodeError as e: raise StateException("NoNodeError while deleting execution state", StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2] except NotEmptyError as e: raise StateException("NotEmptyError while deleting execution state", StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2] except ZookeeperError as e: raise StateException("Zookeeper while deleting execution state", StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2] except Exception as e: # Just re raise the exception. raise def get_tmaster(self, topologyName, callback=None): isWatching = False # Temp dict used to return result # if callback is not provided. ret = { "result": None } if callback: isWatching = True else: # Custom callback to get the topologies # right now. def callback(data): ret["result"] = data self._get_tmaster_with_watch(topologyName, callback, isWatching) # The topologies are now populated with the data. return ret["result"] def _get_tmaster_with_watch(self, topologyName, callback, isWatching): """ Helper function to get pplan with a callback. The future watch is placed only if isWatching is True. """ path = self.get_tmaster_path(topologyName) if isWatching: LOG.info("Adding data watch for path: " + path) @self.client.DataWatch(path) def watch_tmaster(data, stats): if data: tmaster = TMasterLocation() tmaster.ParseFromString(data) callback(tmaster) else: callback(None) # Returning False will result in no future watches # being triggered. If isWatching is True, then # the future watches will be triggered. return isWatching def get_scheduler_location(self, topologyName, callback=None): isWatching = False # Temp dict used to return result # if callback is not provided. 
ret = { "result": None } if callback: isWatching = True else: # Custom callback to get the scheduler location # right now. def callback(data): ret["result"] = data self._get_scheduler_location_with_watch(topologyName, callback, isWatching) return ret["result"] def _get_scheduler_location_with_watch(self, topologyName, callback, isWatching): """ Helper function to get scheduler location with a callback. The future watch is placed only if isWatching is True. """ path = self.get_scheduler_location_path(topologyName) if isWatching: LOG.info("Adding data watch for path: " + path) @self.client.DataWatch(path) def watch_scheduler_location(data, stats): if data: scheduler_location = SchedulerLocation() scheduler_location.ParseFromString(data) callback(scheduler_location) else: callback(None) # Returning False will result in no future watches # being triggered. If isWatching is True, then # the future watches will be triggered. return isWatching
class ZookeeperRegistry(Registry):
    """Registry implementation backed by a ZooKeeper ensemble (via kazoo).

    Provider and configurator children are read from
    ``dubbo/<interface>/providers`` and ``dubbo/<interface>/configurators``;
    consumers are registered as ephemeral nodes under
    ``dubbo/<interface>/consumers``.
    """
    _app_config = ApplicationConfig('default_app')
    # Last connection state reported by kazoo ('UNCONNECT' until the first
    # listener callback arrives).
    _connect_state = 'UNCONNECT'

    def __init__(self, zk_hosts, application_config=None):
        """Create the kazoo client for ``zk_hosts`` and start it.

        :param zk_hosts: host string handed to ``KazooClient(hosts=...)``.
        :param application_config: optional replacement for the class-level
            default application config.
        """
        Registry.__init__(self)
        if application_config:
            self._app_config = application_config
        self.__zk = KazooClient(hosts=zk_hosts)
        self.__zk.add_listener(self.__state_listener)
        self.__zk.start()

    def __state_listener(self, state):
        """Record the latest kazoo connection state.

        Every state (LOST, SUSPENDED or connected/reconnected) is handled
        the same way: it is simply stored in ``_connect_state``.
        """
        self._connect_state = state

    def __unquote(self, origin_nodes):
        """Return a generator of URL-unquoted node names, skipping empty ones."""
        return (urllib.parse.unquote(child_node)
                for child_node in origin_nodes if child_node)

    def _do_event(self, event):
        """Handle a watch event fired for a ``providers`` path.

        :param event: watch event; ``event.path`` looks like
            /dubbo/com.ofpay.demo.api.UserProvider/providers
        """
        # To obtain the service name, the leading "/dubbo/" (7 characters)
        # and the trailing "/providers" are stripped from event.path.
        # The service nodes queried from ZooKeeper are then merged into the
        # provider dict kept by the registry.
        logger.info("receive event is {0}, event state is {1}".format(
            event, event.state))
        provide_name = event.path[7:event.path.rfind('/')]
        if event.state in ['CONNECTED', 'DELETED']:
            # Reading the children again also re-arms the watch.
            children = self.__zk.get_children(event.path,
                                              watch=self.event_listener)
            self._compare_swap_nodes(provide_name, self.__unquote(children))
        configurators_nodes = self._get_provider_configuration(provide_name)
        self._set_provider_configuration(provide_name, configurators_nodes)

    def _do_config_event(self, event):
        """Handle a watch event fired for a ``configurators`` path.

        The zk directory path looks like
        /dubbo/com.qianmi.pc.api.es.item.EsGoodsQueryProvider/configurators

        :param event: watch event carrying ``path`` and ``state``.
        :return: None
        """
        logger.info("receive config event is {0}, event state is {1}".format(
            event, event.state))
        provide_name = event.path[7:event.path.rfind('/')]
        configurators_nodes = self._get_provider_configuration(provide_name)
        self._set_provider_configuration(provide_name, configurators_nodes)

    def register(self, interface, **kwargs):
        """Register this process as a consumer of ``interface``.

        Builds a ``consumer://`` URL from the application config and creates
        it (URL-quoted) as an ephemeral node under the consumers path.

        :param interface: service name used in the path and the URL.
        """
        # NOTE: reads the local address from kazoo's private connection
        # socket, so the client must already be connected.
        ip = self.__zk._connection._socket.getsockname()[0]
        params = {
            'interface': interface,
            'application': self._app_config.name,
            'application.version': self._app_config.version,
            'category': 'consumer',
            'dubbo': 'dubbo-client-py-1.0.1',
            'environment': self._app_config.environment,
            'method': '',
            'owner': self._app_config.owner,
            'side': 'consumer',
            'pid': os.getpid(),
            'version': '1.0'
        }
        url = 'consumer://{0}/{1}?{2}'.format(ip, interface,
                                              urllib.parse.urlencode(params))
        consumer_path = '{0}/{1}/{2}'.format('dubbo', interface, 'consumers')
        self.__zk.ensure_path(consumer_path)
        self.__zk.create(consumer_path + '/' + urllib.parse.quote(url, safe=''),
                         ephemeral=True)

    def subscribe(self, interface, **kwargs):
        """Watch the registry for providers of ``interface`` going up or down.

        :param interface: service name such as com.ofpay.demo.api.UserProvider
        :param kwargs: accepted for interface compatibility; not used here.
        :return: None
        """
        providers_children = self.__zk.get_children(
            '{0}/{1}/{2}'.format('dubbo', interface, 'providers'),
            watch=self.event_listener)
        logger.debug("watch node is {0}".format(providers_children))
        # Called only to place the watch on the configurators path; the
        # children are fetched again by _get_provider_configuration below.
        self.__zk.get_children(
            '{0}/{1}/{2}'.format('dubbo', interface, 'configurators'),
            watch=self.configuration_listener)
        # Re-add all providers.
        self._compare_swap_nodes(interface, self.__unquote(providers_children))
        configurators_nodes = self._get_provider_configuration(interface)
        self._set_provider_configuration(interface, configurators_nodes)

    def _get_provider_configuration(self, interface):
        """Fetch dubbo custom configuration nodes for ``interface``.

        The nodes are read from the "dubbo/{interface}/configurators" path
        and a watch is placed on it.

        :param interface: service name.
        :return: generator of unquoted configurator node names, or None when
            the lookup fails (the failure is logged as a warning).
        """
        try:
            configurators_nodes = self.__zk.get_children(
                '{0}/{1}/{2}'.format('dubbo', interface, 'configurators'),
                watch=self.configuration_listener)
            logger.debug(
                "configurators node is {0}".format(configurators_nodes))
            return self.__unquote(configurators_nodes)
        except Exception as e:
            # Best effort: callers receive None when the path cannot be read.
            logger.warning("get provider %s configuration error %s",
                           interface, str(e))
            return None
class ZooKeeper(AbstractDCS):
    """AbstractDCS implementation that keeps cluster state in ZooKeeper via kazoo."""

    def __init__(self, config):
        """Build the kazoo client from `config` and start it.

        Reads `hosts` (list or string), `retry_timeout` and `ttl` from `config`.
        """
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'],
                                   connection_retry=KazooRetry(max_delay=1, max_tries=-1, sleep_func=time.sleep),
                                   command_retry=KazooRetry(deadline=config['retry_timeout'],
                                                            max_delay=1, max_tries=-1, sleep_func=time.sleep))
        self._client.add_listener(self.session_listener)

        # Force the first _load_cluster() call to read from ZooKeeper.
        self._fetch_cluster = True

        # Wrap kazoo's private connect routine so that _kazoo_connect can
        # substitute its own read timeout (see its docstring).
        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang...

        Returns a 2-tuple: the substituted timeout (in milliseconds, never below 2000) and the
        second element of the original connect result.
        """

        ret = self._orig_kazoo_connect(host, port)
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        """Kazoo state listener: on SUSPENDED or LOST request a cluster re-fetch."""
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        """Watch callback: mark the cached cluster as stale and set `self.event`.

        `event` is not used.
        """
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        """Apply new `retry_timeout`, `loop_wait` and `ttl` values from `config`."""
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called).

        When the value is unchanged nothing happens and the method returns None.
        """
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        """The kazoo client's current `_session_timeout` value."""
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        """Set `deadline` on the client's KazooRetry object to `retry_timeout`."""
        # `retry` is used when it is a KazooRetry instance, otherwise the private `_retry` is used.
        retry = self._client.retry if isinstance(self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        """Read `key`; return `(value decoded as utf-8, second element of the kazoo result)`.

        Returns None when the node does not exist.
        """
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        """Build a Member from a node's name, value and znode (version / ephemeralOwner)."""
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        """Return the children of `key`, or an empty list when the node does not exist."""
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        """Return a list of Member objects for every child of `members_path`.

        A data watch is placed only on the member whose name equals `sync_standby`.
        """
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            watch = member == sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        """Read every cluster key from ZooKeeper and assemble a Cluster object.

        Clears `_fetch_cluster` and `self.event` first; watches placed while reading
        set them again through `cluster_watcher`.
        """
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            # Nothing under the cluster path: keep fetching on the next call.
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(history[1].mzxid, history[0])

        # get last leader operation
        # (read only while _fetch_cluster is set, i.e. when no children were found above)
        last_leader_operation = self._OPTIME in nodes and self._fetch_cluster and self.get_node(self.leader_optime_path)
        last_leader_operation = last_leader_operation and int(last_leader_operation[0]) or 0

        # get synchronization state
        sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.sync_standby or None
        members = self.load_members(sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            # The leader key carries our name but is owned by a different
            # ZooKeeper session: remove it.
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                # Placeholder member (index -1) is used when the leader is not in the members list.
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                # Keep fetching while the leader has no matching member entry.
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)

    def _load_cluster(self):
        """Return the cached cluster, re-reading it when stale or missing.

        Raises ZooKeeperError when the read fails with any exception.
        """
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')
        return cluster

    def _create(self, path, value, retry=False, ephemeral=False):
        """Create `path` with `value` (parents included); return True on success, False otherwise.

        With `retry` the call goes through the client's retry helper; otherwise the
        async call is awaited with a 1 second timeout.
        """
        try:
            if retry:
                self._client.retry(self._client.create, path, value, makepath=True, ephemeral=ephemeral)
            else:
                self._client.create_async(path, value, makepath=True, ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        """Try to create the leader key holding our name; ephemeral unless `permanent`."""
        ret = self._create(self.leader_path, self._name.encode('utf-8'), retry=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self, key, value, index=None, retry=False, do_not_create_empty=False):
        """Set `key` to `value` (utf-8 encoded), creating it when missing and `index` is None.

        `index` is passed as the expected version (-1 when falsy). Returns True on
        success and False on failure.
        """
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set, key, value, version=index or -1)
            else:
                self._client.set_async(key, value, version=index or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        """Write `value` to the failover key (no retry)."""
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        """Write `value` to the config key, with retry."""
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        """Create the initialize key with `sysid`, or overwrite it when `create_new` is false."""
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        """Make sure our member key exists and holds `data`; return True on success."""
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        # Delete the existing key when it belongs to another session or when
        # tags / version / checkpoint_after_promote differ, so it is created anew below.
        if member and (self._client.client_id is not None and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}), data.get('tags', {})) and
                            member.data.get('version') == data.get('version') and
                            member.data.get('checkpoint_after_promote') == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(self.member_path, encoded_data, makepath=True,
                                          ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                # NodeExistsError falls through to the set() below.
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        """Same as `attempt_to_acquire_leader()` with default arguments."""
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        """Write `last_operation` to the leader optime key."""
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        """No work is done here; always returns True."""
        return True

    def delete_leader(self):
        """Restart the kazoo client and return True."""
        self._client.restart()
        return True

    def _cancel_initialization(self):
        """Delete the initialize key, if present, at the version that was just read."""
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        """Run `_cancel_initialization` with retry; failures are logged, not raised."""
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        """Recursively delete the cluster path; a missing path counts as success."""
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        """Write `value` to the history key."""
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        """Write `value` to the sync key with retry; an empty value never creates the key."""
        return self._set_or_create(self.sync_path, value, index, retry=True, do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        """Overwrite the sync key with "{}" rather than deleting it."""
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        """Delegate to the parent `watch`; a true result marks the cluster as stale.

        Returns the current `_fetch_cluster` flag.
        """
        if super(ZooKeeper, self).watch(leader_index, timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
class ZooKeeper(object):
    """ZooKeeper client used for OpenLab HA management.

    Nodes live under ``/ha/<node_name>``, their services under
    ``/ha/<node_name>/<role>/<service_name>`` and the HA options under
    ``/ha/configuration``.
    """

    log = logging.getLogger("OpenLabCMD.ZooKeeper")

    # Log zookeeper retry every 10 seconds
    retry_log_rate = 10

    def __init__(self, config=None):
        """
        Zookeeper Client for OpenLab HA management.

        :param config: The config object.
        :type: configparser.ConfigParser.
        :raises exceptions.ClientError: if config is given but is not a
            ConfigParser.
        """
        self.client = None
        self.config = config
        if self.config and not isinstance(self.config,
                                          configparser.ConfigParser):
            raise exceptions.ClientError("config should be a ConfigParser "
                                         "object.")
        self._last_retry_log = 0

    def _connection_listener(self, state):
        """Kazoo state listener; only logs the new state at debug level."""
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
        elif state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
        else:
            self.log.debug("ZooKeeper connection: CONNECTED")

    def logConnectionRetryEvent(self):
        """Log a retry warning, at most once every ``retry_log_rate`` seconds."""
        now = time.monotonic()
        if now - self._last_retry_log >= self.retry_log_rate:
            self.log.warning("Retrying zookeeper connection")
            self._last_retry_log = now

    @property
    def connected(self):
        """True when a client exists and its state is CONNECTED."""
        if self.client is None:
            return False
        return self.client.state == KazooState.CONNECTED

    @property
    def suspended(self):
        """True when there is no client or its state is SUSPENDED."""
        if self.client is None:
            return True
        return self.client.state == KazooState.SUSPENDED

    @property
    def lost(self):
        """True when there is no client or its state is LOST."""
        if self.client is None:
            return True
        return self.client.state == KazooState.LOST

    def connect(self, hosts=None, timeout=None, read_only=False):
        """Create the kazoo client (if not created yet) and start it.

        :param hosts: ZooKeeper host string; read from ``[ha]zookeeper_hosts``
            of the config when omitted.
        :param timeout: connect timeout; read from
            ``[ha]zookeeper_connect_timeout`` when omitted (default 5).
        :param read_only: passed through to KazooClient.
        :raises exceptions.ClientError: on missing hosts, invalid timeout or
            retry limit, or when every connection attempt failed.
        """
        if not hosts:
            if not self.config:
                raise exceptions.ClientError('Either config object or hosts '
                                             'string should be provided.')
            try:
                hosts = self.config.get('ha', 'zookeeper_hosts')
            except (configparser.NoOptionError, configparser.NoSectionError):
                raise exceptions.ClientError(
                    "The config doesn't contain [ha]zookeeper_hosts option.")

        # The config object is optional when hosts are passed explicitly, so
        # fall back to the defaults instead of dereferencing None.
        if not timeout:
            timeout = (self.config.get('ha', 'zookeeper_connect_timeout',
                                       fallback=5)
                       if self.config else 5)
        retry_limit = (self.config.get('ha', 'zookeeper_connect_retry_limit',
                                       fallback=5)
                       if self.config else 5)

        try:
            timeout = int(timeout)
        except ValueError:
            raise exceptions.ClientError("zookeeper_connect_timeout "
                                         "should be int-like format.")
        if timeout <= 0:
            raise exceptions.ClientError("zookeeper_connect_timeout "
                                         "should be larger than 0.")
        # Values read from a ConfigParser are strings; the retry loop below
        # needs an int to compare against.
        try:
            retry_limit = int(retry_limit)
        except ValueError:
            raise exceptions.ClientError("zookeeper_connect_retry_limit "
                                         "should be int-like format.")

        if self.client is None:
            self.client = KazooClient(hosts=hosts, timeout=timeout,
                                      read_only=read_only)
            self.client.add_listener(self._connection_listener)
            # Manually retry initial connection attempt
            tried_times = 0
            while tried_times < retry_limit:
                try:
                    self.client.start(1)
                    break
                except Exception:
                    self.logConnectionRetryEvent()
                    tried_times += 1
            if tried_times == retry_limit:
                self.client = None
                raise exceptions.ClientError(
                    "Tried %s times, failed connecting "
                    "zookeeper." % retry_limit)

    def disconnect(self):
        """Stop and close the client if it is currently connected."""
        if self.client is not None and self.client.connected:
            self.client.stop()
            self.client.close()
            self.client = None

    def _client_check_wrapper(func):
        """Decorator: raise ClientError unless ``connect`` created a client."""
        def wrapper(self, *args, **kwargs):
            if not self.client:
                raise exceptions.ClientError(
                    "Should call connect function first to initialise "
                    "zookeeper client")
            return func(self, *args, **kwargs)
        return wrapper

    @staticmethod
    def _normalize_filter(value, name):
        """Return a filter as a list: a string becomes a one-item list.

        Falsy values are returned unchanged (meaning "no filter").

        :raises exceptions.ValidationError: if a truthy value is neither a
            string nor a list.
        """
        if not value:
            return value
        if isinstance(value, str):
            return [value]
        if not isinstance(value, list):
            raise exceptions.ValidationError(
                "%s should be a list or string." % name)
        return value

    @_client_check_wrapper
    def list_nodes(self, with_zk=True, node_role_filter=None,
                   node_type_filter=None):
        """List the node objects stored under ``/ha``, sorted by name.

        :param with_zk: when False, children whose name contains
            'zookeeper' are skipped.
        :param node_role_filter: list or string of accepted roles.
        :param node_type_filter: list or string of accepted types.
        :return: sorted list of nodes; empty list if ``/ha`` does not exist.
        """
        node_role_filter = self._normalize_filter(node_role_filter,
                                                  'node_role_filter')
        node_type_filter = self._normalize_filter(node_type_filter,
                                                  'node_type_filter')

        path = '/ha'
        try:
            nodes_objs = []
            for exist_node in self.client.get_children(path):
                # '/ha/configuration' holds options, it is not a node.
                if exist_node == 'configuration':
                    continue
                if not with_zk and 'zookeeper' in exist_node:
                    continue
                node_obj = self.get_node(exist_node)
                if node_role_filter and node_obj.role not in node_role_filter:
                    continue
                if node_type_filter and node_obj.type not in node_type_filter:
                    continue
                nodes_objs.append(node_obj)
        except kze.NoNodeError:
            return []
        return sorted(nodes_objs, key=lambda x: x.name)

    @_client_check_wrapper
    def get_node(self, node_name):
        """Return the node object stored at ``/ha/<node_name>``.

        :raises exceptions.ClientError: if the znode does not exist.
        """
        try:
            node_bytes = self.client.get('/ha/%s' % node_name)
            node_obj = node.Node.from_zk_bytes(node_bytes)
            return node_obj
        except kze.NoNodeError:
            raise exceptions.ClientError('Node %s not found.' % node_name)

    def _init_service(self, node_name, node_type):
        """Create the role directories of a node and its service znodes.

        The services created are those listed for ``node_type`` in
        ``service.service_mapping``; roles without an entry for that type
        are skipped.
        """
        path = '/ha/%s' % node_name
        master_service_path = path + '/master'
        slave_service_path = path + '/slave'
        zookeeper_service_path = path + '/zookeeper'

        self.client.create(master_service_path)
        self.client.create(slave_service_path)
        self.client.create(zookeeper_service_path)

        for node_role, all_services in service.service_mapping.items():
            new_service_path = path + '/%s' % node_role
            try:
                node_services = all_services[node_type]
            except KeyError:
                continue
            for service_type, service_names in node_services.items():
                service_class = (service.NecessaryService
                                 if service_type == 'necessary'
                                 else service.UnnecessaryService)
                for service_name in service_names:
                    new_service = service_class(service_name, node_name)
                    self.client.create(
                        new_service_path + '/%s' % service_name,
                        value=new_service.to_zk_bytes())

    @_client_check_wrapper
    def create_node(self, name, role, n_type, ip):
        """Create a node znode plus its service tree and return the node.

        :raises exceptions.ClientError: if a node with the same role and
            type, or with the same name, already exists.
        """
        existed_nodes = self.list_nodes()
        for existed_node in existed_nodes:
            # The (role, type) pair must be unique: compare the existing
            # node's type with n_type.
            if existed_node.role == role and existed_node.type == n_type:
                raise exceptions.ClientError(
                    "The role and type of the node should be unique.")

        path = '/ha/%s' % name
        new_node = node.Node(name, role, n_type, ip)
        try:
            self.client.create(path,
                               value=new_node.to_zk_bytes(),
                               makepath=True)
        except kze.NodeExistsError:
            raise exceptions.ClientError("The node %s is already existed."
                                         % name)
        self._init_service(name, n_type)
        node_obj = self.get_node(name)
        return node_obj

    @_client_check_wrapper
    def update_node(self, node_name, maintain=None, role=None, **kwargs):
        """Update a node and return the freshly re-read node object.

        :param maintain: True moves an 'up' node to 'maintaining'; False
            moves it back and refreshes its heartbeat; None leaves the
            status alone.
        :param role: new role, if truthy.
        :param kwargs: extra fields passed to ``node_obj.update``;
            ``switch_status`` must be 'start' or 'end' (case-insensitive).
        :raises exceptions.ClientError: on an invalid status transition or
            switch_status value.
        """
        path = '/ha/%s' % node_name
        node_obj = self.get_node(node_name)
        if maintain is not None:
            if maintain:
                if node_obj.status == node.NodeStatus.UP:
                    node_obj.status = node.NodeStatus.MAINTAINING
                else:
                    raise exceptions.ClientError(
                        "The node must be in 'up' status when trying to "
                        "maintain it.")
            else:
                if node_obj.status == node.NodeStatus.MAINTAINING:
                    node_obj.status = node.NodeStatus.UP
                    node_obj.heartbeat = datetime.datetime.utcnow().strftime(
                        '%Y-%m-%d %H:%M:%S')
                else:
                    raise exceptions.ClientError(
                        "The node must be in 'maintaining' status when trying "
                        "to un-maintain it.")
        if role:
            node_obj.role = role
        switch_status = kwargs.get('switch_status')
        if switch_status is not None:
            if switch_status.lower() not in ['start', 'end']:
                raise exceptions.ClientError(
                    "switch_status must be 'start', 'end'")
        node_obj.update(kwargs)
        self.client.set(path, value=node_obj.to_zk_bytes())

        node_obj = self.get_node(node_name)
        return node_obj

    @_client_check_wrapper
    def delete_node(self, node_name):
        """Recursively delete ``/ha/<node_name>``.

        :raises exceptions.ClientError: if the node does not exist.
        """
        # get_node is called only for its "not found" check.
        self.get_node(node_name)
        path = '/ha/%s' % node_name
        self.client.delete(path, recursive=True)

    @_client_check_wrapper
    def list_services(self, node_name_filter=None, node_role_filter=None,
                      status_filter=None):
        """
        List the services in the HA deployment.

        :param node_name_filter: The node filter.
        :type node_name_filter: list or string.
        :param node_role_filter: The node filter.
        :type node_role_filter: list or string.
        :param status_filter: The status filter.
        :type status_filter: list or string.
        :return: the services list, sorted by node name.
        """
        node_name_filter = self._normalize_filter(node_name_filter,
                                                  'node_name_filter')
        node_role_filter = self._normalize_filter(node_role_filter,
                                                  'node_role_filter')
        status_filter = self._normalize_filter(status_filter,
                                               'status_filter')

        result = []
        for exist_node in self.list_nodes():
            if node_name_filter and exist_node.name not in node_name_filter:
                continue
            if node_role_filter and exist_node.role not in node_role_filter:
                continue
            # Only the services under the node's current role are listed.
            path = '/ha/%s/%s' % (exist_node.name, exist_node.role)
            for service_name in self.client.get_children(path):
                service_path = path + '/' + service_name
                service_bytes = self.client.get(service_path)
                service_obj = service.Service.from_zk_bytes(service_bytes)
                if status_filter and service_obj.status not in status_filter:
                    continue
                result.append(service_obj)
        return sorted(result, key=lambda x: x.node_name)

    @_client_check_wrapper
    def get_service(self, service_name, node_name):
        """Return the service stored under the node's current role.

        :raises exceptions.ClientError: if the node or service is missing.
        """
        service_node = self.get_node(node_name)
        path = '/ha/%s/%s/%s' % (service_node.name, service_node.role,
                                 service_name)
        try:
            service_bytes = self.client.get(path)
        except kze.NoNodeError:
            raise exceptions.ClientError('Service %s not found.'
                                         % service_name)
        service_obj = service.Service.from_zk_bytes(service_bytes)
        return service_obj

    @_client_check_wrapper
    def update_service(self, service_name, node_name, alarmed=None,
                       restarted=None, status=None, **kwargs):
        """Update a service and return the freshly re-read service object.

        :param alarmed: bool; when True ``alarmed_at`` is set to now (UTC).
        :param restarted: bool; when True ``restarted_at`` is set to now
            (UTC).
        :param status: one of ``service.ServiceStatus().all_status``.
        :param kwargs: extra fields passed to the service's ``update``.
        :raises exceptions.ValidationError: on a non-bool flag or an
            unknown status.
        """
        old_service = self.get_service(service_name, node_name)
        service_node = self.get_node(node_name)
        path = '/ha/%s/%s/%s' % (service_node.name, service_node.role,
                                 service_name)
        current_time = datetime.datetime.utcnow().isoformat()
        if alarmed is not None:
            if not isinstance(alarmed, bool):
                raise exceptions.ValidationError('alarmed should be boolean '
                                                 'value.')
            old_service.alarmed = alarmed
            if alarmed:
                old_service.alarmed_at = current_time
        if restarted is not None:
            if not isinstance(restarted, bool):
                raise exceptions.ValidationError('restarted should be '
                                                 'boolean value.')
            old_service.restarted = restarted
            if restarted:
                old_service.restarted_at = current_time
        if status:
            if status not in service.ServiceStatus().all_status:
                raise exceptions.ValidationError(
                    'status should be in %s.' %
                    service.ServiceStatus().all_status)
            old_service.status = status
        old_service.update(kwargs)
        self.client.set(path, value=old_service.to_zk_bytes())

        new_service = self.get_service(service_name, node_name)
        return new_service

    @_client_check_wrapper
    def switch_master_and_slave(self):
        """Mark node's switch status to start.

        This func is called by labkeeper deploy tool. So that operators can
        switch master-slave role by hand. Once health checker find that all
        nodes' switch status are `start`, it will start to switch cluster.
        """
        # Named ha_node so the project's `node` module is not shadowed.
        for ha_node in self.list_nodes():
            if ha_node.type != 'zookeeper':
                self.update_node(ha_node.name, switch_status='start')

    @_client_check_wrapper
    def check_and_repair_deployment_sg(self, is_dry_run=False):
        """Check and Repair current HA deployment Security Group configuration

        This func is called by labkeeper deploy tool. So that operators can
        check and repair exist deployment from zookeeper. The function is for
        checking Cloud Security Group configuration.

        :param is_dry_run: when True only print what would be created or
            removed; when False create the missing rules and delete the
            unexpected ones.
        :raises exceptions.ClientError: if a security group cannot be found
            or a rule cannot be created/deleted.
        """
        deploy_map = {}
        cloud_provide_rules = {}
        unexpect_rules = {}
        for ha_node in self.list_nodes():
            # Start from all HA ports and drop those this node type does
            # not need.
            ha_ports_cp = copy.deepcopy(constants.HA_PORTS)
            if ha_node.type == 'nodepool':
                ha_ports_cp.remove(constants.MYSQL_HA_PORT)
            elif ha_node.type == 'zuul':
                for p in constants.ZOOKEEPER_HA_PORTS:
                    ha_ports_cp.remove(p)
            elif ha_node.type == 'zookeeper':
                ha_ports_cp.remove(constants.RSYNCD_HA_PORT)
                ha_ports_cp.remove(constants.MYSQL_HA_PORT)

            # The part of the node name before the first '-' is used as the
            # cloud name.
            cloud = ha_node.name.split("-")[0]
            if cloud not in deploy_map:
                deploy_map[cloud] = {'nodes': [ha_node]}
                cloud_provide_rules[cloud] = {
                    ha_node.ip + '/32': ha_ports_cp}
            else:
                deploy_map[cloud]['nodes'].append(ha_node)
                cloud_provide_rules[cloud][ha_node.ip + '/32'] = ha_ports_cp

        # Fit current expect_rules
        expect_rules = {}
        sg_map = {}
        cloud_names = list(cloud_provide_rules.keys())
        for cloud_name, ip_dict in cloud_provide_rules.items():
            c_names = copy.deepcopy(cloud_names)
            c_names.remove(cloud_name)
            expect_rules[cloud_name] = copy.deepcopy(ip_dict)
            if len(cloud_provide_rules[cloud_name].keys()) > 1:
                # A cloud hosting several nodes expects every rule of the
                # other clouds as well.
                for c_name in c_names:
                    expect_rules[cloud_name].update(
                        copy.deepcopy(cloud_provide_rules[c_name]))
            else:
                # A single-node cloud: peers that include port 2888 get the
                # ZooKeeper HA ports, every other peer only port 2181.
                for c_name in c_names:
                    for ip in cloud_provide_rules[c_name].keys():
                        if 2888 in cloud_provide_rules[c_name][ip]:
                            zk_ha_ports = copy.deepcopy(
                                constants.ZOOKEEPER_HA_PORTS)
                            expect_rules[cloud_name][ip] = zk_ha_ports
                        else:
                            expect_rules[cloud_name][ip] = [2181]

        for cloud_name, nodes_dict in deploy_map.items():
            net_client = os_client_config.make_rest_client(
                'network', cloud=cloud_name)
            for sg_name in constants.HA_SGs:
                url = "/security-groups?name=%s" % sg_name
                resp = net_client.get(url)
                security_groups = (resp.json()['security_groups']
                                   if resp.status_code == 200 else [])
                # An empty list means no group has that name; report it the
                # same way as a failed request.
                if not security_groups:
                    raise exceptions.ClientError(
                        'Security group %(sg_name)s not found on '
                        'cloud %(cloud_name)s.' %
                        {'sg_name': sg_name, 'cloud_name': cloud_name})
                sgr_data = security_groups[0]
                if cloud_name not in sg_map:
                    sg_map[cloud_name] = sgr_data['id']
                for rule in sgr_data['security_group_rules']:
                    if rule['direction'] != 'ingress':
                        continue
                    is_specified_1_port = (
                        rule['port_range_min'] == rule['port_range_max'])
                    is_ipv4 = rule['ethertype'] == 'IPv4'
                    is_tcp = rule['protocol'] == 'tcp'
                    prefix = rule['remote_ip_prefix']
                    rule_summary = (prefix, rule['port_range_min'],
                                    rule['id'])
                    expected_ports = expect_rules[cloud_name].get(prefix)
                    if not expected_ports:
                        unexpect_rules.setdefault(
                            cloud_name, []).append(rule_summary)
                    elif (is_specified_1_port and is_ipv4 and is_tcp and
                            rule['port_range_min'] in expected_ports):
                        # The rule is present: tick it off so that only the
                        # missing ports remain in expect_rules.
                        expected_ports.remove(rule['port_range_min'])
                        if len(expected_ports) == 0:
                            expect_rules[cloud_name].pop(prefix)
                    else:
                        unexpect_rules.setdefault(
                            cloud_name, []).append(rule_summary)

        if not is_dry_run:
            # analysis expect_rules
            for cloud_name, ip_dict in expect_rules.items():
                if not ip_dict:
                    print("Cloud %s: PASSED" % cloud_name)
                    continue
                print("Recover security group rules for cloud %s:" %
                      cloud_name)
                # Here means the sg lacks SG_rule settings
                net_client = os_client_config.make_rest_client(
                    'network', cloud=cloud_name)
                for ip, ports in ip_dict.items():
                    req = {
                        "security_group_rule": {
                            "direction": "ingress",
                            "ethertype": "IPv4",
                            "protocol": "tcp",
                            "security_group_id": sg_map[cloud_name],
                            "remote_ip_prefix": ip
                        }
                    }
                    for port in ports:
                        req["security_group_rule"].update({
                            "port_range_min": port,
                            "port_range_max": port
                        })
                        resp = net_client.post('/security-group-rules',
                                               json=req)
                        if resp.status_code != 201:
                            raise exceptions.ClientError(
                                'Failed to create security group rule on '
                                'cloud %(cloud_name)s with summary '
                                '%(ip)s %(port)s' %
                                {'cloud_name': cloud_name,
                                 'ip': ip, 'port': port})
                        print("Create new sg_rule, summary %(ip)s %(port)s" % {
                            "ip": ip,
                            "port": str(port)
                        })
            # remove unexpect sg_rules
            for cloud_name, ip_port_tuple_list in unexpect_rules.items():
                net_client = os_client_config.make_rest_client(
                    'network', cloud=cloud_name)
                print("Unexpect security group rules clean for cloud %s:" %
                      cloud_name)
                for ip_port_tuple in ip_port_tuple_list:
                    url = "/security-group-rules/%s" % ip_port_tuple[2]
                    resp = net_client.delete(url)
                    if resp.status_code != 204:
                        raise exceptions.ClientError(
                            'Failed to delete security group rule '
                            '%(rule_id)s on cloud %(cloud_name)s' %
                            {'cloud_name': cloud_name,
                             'rule_id': ip_port_tuple[2]})
                    print("Remove sg_rule %(rule_id)s, summary %(ip)s "
                          "%(port)s" % {
                              "rule_id": ip_port_tuple[2],
                              "ip": ip_port_tuple[0],
                              "port": str(ip_port_tuple[1])
                          })
        else:
            for cloud_name, ip_dict in expect_rules.items():
                if not ip_dict:
                    print("Cloud %s: PASSED" % cloud_name)
                    continue
                print("Found lack security group rules in cloud %s" %
                      cloud_name)
                for ip, ports in ip_dict.items():
                    # The placeholders need their '%' for the ip and ports
                    # to be interpolated into the message.
                    print(" Need to create new rule for %(ip)s %(ports)s" % {
                        "ip": ip,
                        "ports": str(ports)
                    })
            # remove unexpect sg_rules
            for cloud_name, ip_port_tuple_list in unexpect_rules.items():
                print("Found unexpect security group rules clean for "
                      "cloud %s:" % cloud_name)
                for ip_port_tuple in ip_port_tuple_list:
                    print(" Need to remove sg_rule %(rule_id)s, "
                          "summary %(ip)s %(port)s" % {
                              "rule_id": ip_port_tuple[2],
                              "ip": ip_port_tuple[0],
                              "port": str(ip_port_tuple[1])
                          })

    def _init_ha_configuration(self):
        """Create ``/ha/configuration`` holding CONFIGURATION_DICT as JSON."""
        path = '/ha/configuration'
        self.client.create(path,
                           value=json.dumps(CONFIGURATION_DICT).encode('utf8'),
                           makepath=True)

    @_client_check_wrapper
    def list_configuration(self):
        """Return the HA options as a dict, creating the defaults if absent."""
        path = '/ha/configuration'
        try:
            config_bytes = self.client.get(path)
        except kze.NoNodeError:
            self._init_ha_configuration()
            config_bytes = self.client.get(path)
        # client.get returns a sequence whose first item is the raw data.
        return json.loads(config_bytes[0].decode('utf8'))

    @_client_check_wrapper
    def update_configuration(self, name, value):
        """Set one existing HA option and write the whole dict back.

        :raises exceptions.ClientError: if ``name`` is not a known option.
        """
        path = '/ha/configuration'
        configs = self.list_configuration()
        if name not in configs:
            raise exceptions.ClientError('There is not option %s' % name)
        configs[name] = value
        self.client.set(path, json.dumps(configs).encode('utf8'))
class ZooKeeper(AbstractDCS):
    """Cluster-state store backed by ZooKeeper (via Kazoo).

    Keeps a cached view of the cluster (members, leader, last leader
    operation) and refreshes it when a watch or a session state change
    sets ``fetch_cluster``.
    """

    def __init__(self, name, config):
        """Build the Kazoo client from *config* and start the session.

        :param name: name of the local node, forwarded to ``AbstractDCS``.
        :param config: mapping; reads ``hosts``, optional ``exhibitor``
            (``hosts``, ``port``, ``poll_interval``), ``session_timeout``
            and ``reconnect_timeout``.
        """
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'],
                                                       poll_interval=interval)
            # Exhibitor, when configured, overrides the static host list.
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(hosts=hosts,
                                  timeout=(config.get('session_timeout', None) or 30),
                                  command_retry={
                                      'deadline': (config.get('reconnect_timeout', None) or 10),
                                      'max_delay': 1,
                                      'max_tries': -1
                                  },
                                  connection_retry={
                                      'max_delay': 1,
                                      'max_tries': -1
                                  })
        self.client.add_listener(self.session_listener)
        self.cluster_event = self.client.handler.event_object()

        self.fetch_cluster = True
        self.members = []
        self.leader = None
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        """Force a cluster reload when the session is suspended or lost."""
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        """Watch callback: mark the cached cluster stale and wake ``sleep``."""
        self.fetch_cluster = True
        self.cluster_event.set()

    def get_node(self, name, watch=None):
        """Return ``client.get`` result for *name*, or ``None``.

        A missing node is silent; any other error is logged.
        """
        try:
            return self.client.get(self.client_path(name), watch)
        except NoNodeError:
            pass
        except Exception:
            logger.exception('get_node')
        return None

    @staticmethod
    def member(name, value, znode):
        """Build a ``Member`` from a member node's name, data and stat."""
        conn_url, api_url = parse_connection_string(value)
        return Member(znode.mzxid, name, conn_url, api_url, None, None)

    def load_members(self):
        """Read every child of ``/members`` (watching the child list)."""
        members = []
        for member in self.client.get_children(self.client_path('/members'), self.cluster_watcher):
            data = self.get_node('/members/' + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        """Refresh ``members``, ``leader`` and ``last_leader_operation``."""
        self.cluster_event.clear()
        leader = self.get_node('/leader', self.cluster_watcher)
        self.members = self.load_members()
        if leader:
            if leader[0] == self._name:
                client_id = self.client.client_id
                # The leader node carries our name but belongs to another
                # session: drop it so leadership can be re-acquired.
                if client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                    logger.info('I am leader but not owner of the session. Removing leader node')
                    self.client.delete(self.client_path('/leader'))
                    leader = None

            if leader:
                for member in self.members:
                    if member.name == leader[0]:
                        leader = member
                        self.fetch_cluster = False
                        break

            # NOTE(review): when no member matched, the raw ``get`` result
            # (or None) is stored as the Member name -- confirm intended.
            if not isinstance(leader, Member):
                leader = Member(-1, leader, None, None, None, None)
        self.leader = leader
        if self.fetch_cluster:
            last_leader_operation = self.get_node('/optime/leader')
            if last_leader_operation:
                self.last_leader_operation = int(last_leader_operation[0])

    def get_cluster(self):
        """Return the current ``Cluster``, reloading from ZooKeeper if stale.

        :raises ZooKeeperError: when the reload fails.
        """
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper in not responding properly')

        return Cluster(True, self.leader, self.last_leader_operation, self.members)

    def _create(self, path, value, **kwargs):
        """Create *path* with *value*; return True on success, else False."""
        try:
            self.client.retry(self.client.create, self.client_path(path), value, **kwargs)
            return True
        except Exception:
            # Best effort: callers only need to know the node was not created.
            return False

    def attempt_to_acquire_leader(self):
        """Try to create the ephemeral ``/leader`` node with our name."""
        ret = self._create('/leader', self._name, makepath=True, ephemeral=True)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def race(self, path):
        """Try to be the first to create the persistent node *path*."""
        return self._create(path, self._name, makepath=True)

    def touch_member(self, connection_string, ttl=None):
        """Make sure our ephemeral ``/members/<name>`` node exists.

        *ttl* is accepted but not used by this method.
        """
        for m in self.members:
            if m.name == self._name:
                return True

        path = self.client_path('/members/' + self._name)
        try:
            self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
            return True
        except NodeExistsError:
            # A node with our name exists but is not in the cached member
            # list: replace it with a fresh one.
            try:
                self.client.retry(self.client.delete, path)
                self.client.retry(self.client.create, path, connection_string, makepath=True, ephemeral=True)
                return True
            except Exception:
                logger.exception('touch_member')
        return False

    def take_leader(self):
        """Alias of :meth:`attempt_to_acquire_leader`."""
        return self.attempt_to_acquire_leader()

    def update_leader(self, state_handler):
        """Persist the leader's last operation when it changed; always True."""
        last_operation = state_handler.last_operation()
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.client_path('/optime/leader')
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create, path, last_operation, makepath=True)
                except Exception:
                    logger.exception('Failed to create %s', path)
            except Exception:
                logger.exception('Failed to update %s', path)
        return True

    def delete_leader(self):
        """Remove ``/leader`` if the cached leader is this node."""
        if isinstance(self.leader, Member) and self.leader.name == self._name:
            self.client.delete(self.client_path('/leader'))

    def sleep(self, timeout):
        """Wait up to *timeout* for a cluster event; flag a reload if one fired."""
        self.cluster_event.wait(timeout)
        if self.cluster_event.is_set():
            self.fetch_cluster = True
class AnalyticsDiscovery(gevent.Greenlet):
    """Greenlet that publishes this instance's info under a ZooKeeper path
    and relays changes of watched sibling services to callbacks."""

    def _sandesh_connection_info_update(self, status, message):
        """Report the ZooKeeper connection *status* and log UP/DOWN transitions.

        *status* is the name of a ``ConnectionStatus`` attribute.
        """
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name, status=new_conn_state,
                               server_addrs=self._zk_server.split(','),
                               message=message)

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Kazoo state listener; exits the process when the session is LOST."""
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            self._sandesh_connection_info_update(
                status='UP',
                message='Connection to Zookeeper re-established')
            self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
            self._reconnect = True
        elif state == KazooState.LOST:
            self._logger.error("Analytics Discovery connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN',
                message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Analytics Discovery connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        """DataWatch callback: cache (or drop) *child*'s data and queue the
        watcher's callback for the main loop."""
        self._logger.error(
            "Analytics Discovery %s ChildData : child %s, data %s, event %s" %
            (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._data_watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        """ChildrenWatch callback: request a full resync in the main loop."""
        self._logger.error("Analytics Discovery Watcher %s Children %s" % (watcher, children))
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst,
                 data_watchers=None, child_watchers=None,
                 zpostfix="", freq=10):
        """
        :param logger: logger to use; the ``logging`` module when None.
        :param zkservers: comma-separated ZooKeeper server list.
        :param svc_name: service name this instance publishes under.
        :param inst: instance name (leaf node under the service).
        :param data_watchers: ``{service: callback}`` invoked with the sorted
            data of the service's children; defaults to an empty dict.
        :param child_watchers: ``{service: callback}`` registered directly as
            ChildrenWatch functions; defaults to an empty dict.
        :param zpostfix: suffix appended to the base path.
        :param freq: seconds between main-loop iterations.
        """
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(
            status='INIT',
            message='Connection to Zookeeper initialized')
        self._zkservers = zkservers
        self._zk = None
        self._pubinfo = None
        self._publock = Semaphore()
        # Fresh dicts per instance: a shared ``{}`` default would leak
        # watchers between instances.
        self._data_watchers = {} if data_watchers is None else data_watchers
        self._child_watchers = {} if child_watchers is None else child_watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        """Publish *pubinfo* as this instance's ephemeral node, or withdraw
        the node when *pubinfo* is None.

        Failures are logged and flagged for retry via ``_reconnect``.
        """
        # This function can be called concurrently by the main AlarmDiscovery
        # processing loop as well as by clients.
        # It is NOT re-entrant
        self._publock.acquire()
        try:
            self._pubinfo = pubinfo
            if self._conn_state == ConnectionStatus.UP:
                try:
                    svc_path = self._basepath + "/" + self._svc_name
                    inst_path = "%s/%s/%s" % (self._basepath, self._svc_name, self._inst)
                    self._logger.error("ensure %s" % svc_path)
                    self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state))
                    self._zk.ensure_path(svc_path)
                    self._logger.error("check for %s/%s/%s" %
                                       (self._basepath, self._svc_name, self._inst))
                    if pubinfo is not None:
                        if self._zk.exists(inst_path):
                            self._zk.set(inst_path, self._pubinfo)
                        else:
                            self._zk.create(inst_path, self._pubinfo, ephemeral=True)
                    else:
                        if self._zk.exists(inst_path):
                            self._logger.error("withdrawing published info!")
                            self._zk.delete(inst_path)
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" %
                                       (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._sandesh_connection_info_update(
                        status='DOWN',
                        message='Reconnect to Zookeeper to handle exception')
                    self._reconnect = True
            else:
                self._logger.error("Analytics Discovery cannot publish while down")
        finally:
            # Always release, even if the greenlet is killed mid-publish.
            self._publock.release()

    def _run(self):
        """Greenlet body: connect, install watches, then loop publishing and
        dispatching watcher callbacks until killed."""
        # Keep trying until a session is established.
        while True:
            self._logger.error("Analytics Discovery zk start")
            self._zk = KazooClient(hosts=self._zkservers, timeout=60.0)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" %
                                       (messag, traceback.format_exc(), self._svc_name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(
                status='UP',
                message='Connection to Zookeeper established')
            self._reconnect = False
            # Done connecting to ZooKeeper

            for wk in self._data_watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            for wk in self._child_watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       self._child_watchers[wk])

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._data_watchers[wk]:
                                self._data_watchers[wk](
                                    sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect"
                                           % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._data_watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath + "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(
                                        self._basepath + "/" + wk + "/" + elem,
                                        partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(
                                    self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                    OrderedDict(sorted(data_dict.items()))

                                self._logger.error(
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" %
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._data_watchers[wk]:
                                self._data_watchers[wk](sorted(
                                    self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" %
                                       self._svc_name)
                    self._zk.remove_listener(self._zk_listen)
                    gevent.sleep(1)
                    try:
                        self._zk.stop()
                    except Exception:
                        self._logger.error("Stopping kazooclient failed")
                    else:
                        self._logger.error("Stopping kazooclient successful")
                    try:
                        self._zk.close()
                    except Exception:
                        self._logger.error("Closing kazooclient failed")
                    else:
                        self._logger.error("Closing kazooclient successful")
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" %
                                       (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" %
                               (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
print("LOST") elif state == KazooState.SUSPENDED: # Handle being disconnected from Zookeeper print("SUSPENDED") else: print("HI") # Handle being connected/reconnected to Zookeeper zk = KazooClient(hosts='zoo:2181') zk.start() zk.ensure_path("/worker") zk.ensure_path("/master") zk.add_listener(my_listener) # children=[] if (zk.exists("/worker")): children = zk.get_children("/worker") print(children) @zk.DataWatch("/master") def watch_node(data, stat, event): # pid= print("data c", data) children = zk.get_children("/worker") # print(children) print("Status >>>>>>>>>>>>>>>", stat)
class ZookeeperWatcher(object):
    zoo_client = None  # The KazooClient to manage the config
    point_path = None  # Zookeeper path to pointed to file
    pointed_at_expired = None  # is True when the assignment has been set to
                               # None but we cannot remove the config listener
    valid_handler = None  # the function to call when the validity changes
    config_handler = None  # the function to call when the config changes
    error_handler = None  # the function to call when an error occurs in reading
    valid_file = False  # the current state of the ConfigWatcher with ZK
    do_not_restart = False  # used when closing via ^C
    old_data = ''  # The current file contents, to see if a change occurred
    old_pointed = ''  # the current pointed path, to see if change occurred

    INVALID_PATH = "Invalid pointer path"
    INVALID_GET = "Invalid get on file path"
    BAD_CONNECTION = "Connection interrupted with Zookeeper, re-establishing"

    def __init__(self, hosts, filepath, valid_handler=None,
                 config_handler=None, error_handler=None, pointer=False,
                 ensure=False, valid_init=True):
        '''
        Zookeeper file watcher, used to tell a program their zookeeper file has
        changed. Can be used to watch a single file, or both a file and path of
        its contents. Manages all connections, drops, reconnections for you.

        @param hosts: The zookeeper hosts to use
        @param filepath: The full path to the file to watch
        @param valid_handler: The method to call for a 'is valid' state change
        @param config_handler: The method to call when a content change occurs
        @param error_handler: The method to call when an error occurs
        @param pointer: Set to true if the file contents are actually a path to
                        another zookeeper file, where the real config resides
        @param ensure: Set to true for the ZooWatcher to create the watched file
        @param valid_init: Ensure the client can connect to Zookeeper first try

        Ex 1. /stuff/A: "stuff I care about"
        Ex 2. /stuff/A: "/other/stuff", /other/stuff: "contents I care about"
            - in Ex 2 you care about /other/stuff contents
              but are only aware of your assignment /stuff/A

        You can use this class as any combination of event driven or polling.
        Polling:
            In the main loop of your program, check if is_valid() is
            True, otherwise clear your contents as there is some ZK error.
        Event:
            You will be notified via the various handlers when content changes.
        '''
        self.hosts = hosts
        self.my_file = filepath
        self.pointer = pointer
        self.ensure = ensure
        self.valid_handler = valid_handler
        self.config_handler = config_handler
        self.error_handler = error_handler

        if valid_init:
            # this will throw an exception if it can't start right away
            self.zoo_client = KazooClient(hosts=self.hosts)
            self.zoo_client.start()

        # NOTE(review): with valid_init=False no client exists yet but
        # no_init=True skips the connect loop -- confirm intended.
        self.threaded_start(no_init=True)

    def threaded_start(self, no_init=False):
        '''
        Spawns a worker thread to set up the zookeeper connection
        and waits for it to finish
        '''
        thread = Thread(target=self.init_connections,
                        kwargs={'no_init': no_init})
        thread.daemon = True
        thread.start()
        thread.join()

    def init_connections(self, no_init=False):
        '''
        Sets up the initial Kazoo Client and watches

        @param no_init: when True, reuse the existing client; otherwise
                        (re)create it, retrying until the file can be read
        '''
        success = False
        self.set_valid(False)

        if not no_init:
            if self.zoo_client:
                self.zoo_client.remove_listener(self.state_listener)
                self.old_data = ''
                self.old_pointed = ''

            while not success:
                try:
                    if self.zoo_client is None:
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                    else:
                        # self.zoo_client.stop()
                        self.zoo_client._connection.connection_stopped.set()
                        self.zoo_client.close()
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                except Exception as e:
                    # str(e): exceptions have no ``.message`` on Python 3
                    log.error("ZKWatcher Exception: " + str(e))
                    sleep(1)
                    continue

                self.setup()
                success = self.update_file(self.my_file)
                sleep(5)
        else:
            self.setup()
            self.update_file(self.my_file)

    def setup(self):
        '''
        Ensures the path to the watched file exists and we have a state
        listener
        '''
        self.zoo_client.add_listener(self.state_listener)

        if self.ensure:
            self.zoo_client.ensure_path(self.my_file)

    def state_listener(self, state):
        '''
        Restarts the session if we get anything besides CONNECTED
        '''
        if state == KazooState.SUSPENDED:
            self.set_valid(False)
            self.call_error(self.BAD_CONNECTION)
        elif state == KazooState.LOST and not self.do_not_restart:
            self.threaded_start()
        elif state == KazooState.CONNECTED:
            # This is going to throw a SUSPENDED kazoo error
            # which will cause the sessions to be wiped and re established.
            # Used b/c of massive connection pool issues
            self.zoo_client.stop()

    def is_valid(self):
        '''
        @return: True if the currently watch file is valid
        '''
        return self.valid_file

    def ping(self):
        '''
        Simple command to test if the zookeeper session is able to connect
        at this very moment
        '''
        try:
            # dummy ping to ensure we are still connected
            self.zoo_client.server_version()
            return True
        except KazooException:
            return False

    def close(self, kill_restart=True):
        '''
        Use when you would like to close everything down
        @param kill_restart= Prevent kazoo restarting from occurring
        '''
        self.do_not_restart = kill_restart
        self.zoo_client.stop()
        self.zoo_client.close()

    def get_file_contents(self, pointer=False):
        '''
        Gets any file contents you care about. Defaults to the main file
        @param pointer: Return the contents of the file pointer, not the
                        pointed at file
        @return: A string of the contents
        '''
        if self.pointer and pointer:
            return self.old_pointed
        return self.old_data

    def watch_file(self, event):
        '''
        Fired when changes made to the file
        '''
        if not self.update_file(self.my_file):
            self.threaded_start()

    def update_file(self, path):
        '''
        Updates the file watcher and calls the appropriate method for results
        @return: False if we need to keep trying the connection
        '''
        try:
            # grab the file
            result, stat = self.zoo_client.get(path, watch=self.watch_file)
        except ZookeeperError:
            self.set_valid(False)
            self.call_error(self.INVALID_GET)
            return False

        # A node may carry no data at all; only decode real payloads so the
        # empty-pointer handling below stays reachable.
        if result is not None:
            result = result.decode('utf-8')

        if self.pointer:
            if result is not None and len(result) > 0:
                self.pointed_at_expired = False
                # file is a pointer, go update and watch other file
                self.point_path = result
                if self.compare_pointer(result):
                    self.update_pointed()
            else:
                self.pointed_at_expired = True
                self.old_pointed = ''
                self.old_data = ''
                self.set_valid(False)
                self.call_error(self.INVALID_PATH)
        else:
            # file is not a pointer, return contents
            if self.compare_data(result):
                self.call_config(result)
            self.set_valid(True)

        return True

    def watch_pointed(self, event):
        '''
        Fired when changes made to pointed file
        '''
        self.update_pointed()

    def update_pointed(self):
        '''
        Grabs the latest file contents based on the pointer uri
        '''
        # only grab file if our pointer is still good (not None)
        if not self.pointed_at_expired:
            try:
                conf_string, stat2 = self.zoo_client.get(
                    self.point_path, watch=self.watch_pointed)
            except ZookeeperError:
                self.old_data = ''
                self.set_valid(False)
                self.pointed_at_expired = True
                self.call_error(self.INVALID_PATH)
                return

            # Same guard as update_file: a data-less node yields None.
            if conf_string is not None:
                conf_string = conf_string.decode('utf-8')

            if self.compare_data(conf_string):
                self.call_config(conf_string)
            self.set_valid(True)

    def set_valid(self, boolean):
        '''
        Sets the state and calls the change if needed
        @param bool: The state (true or false)
        '''
        old_state = self.is_valid()
        self.valid_file = boolean

        if old_state != self.valid_file:
            self.call_valid(self.valid_file)

    def call_valid(self, state):
        '''
        Calls the valid change function passed in
        @param valid_state: The new config
        '''
        if self.valid_handler is not None:
            self.valid_handler(self.is_valid())

    def call_config(self, new_config):
        '''
        Calls the config function passed in
        @param new_config: The new config
        '''
        if self.config_handler is not None:
            self.config_handler(new_config)

    def call_error(self, message):
        '''
        Calls the error function passed in
        @param message: The message to throw
        '''
        if self.error_handler is not None:
            self.error_handler(message)

    def compare_data(self, data):
        '''
        Compares the string data and remembers the new value
        @return: True if the data is different
        '''
        if self.old_data != data:
            self.old_data = data
            return True
        return False

    def compare_pointer(self, data):
        '''
        Compares the pointer string and remembers the new value
        @return: True if the data is different
        '''
        if self.old_pointed != data:
            self.old_pointed = data
            return True
        return False
# Create a zookeeper listener
def _my_listener(state):
    """Log every Kazoo connection state change."""
    if state == KazooState.LOST:
        # Register somewhere that the session was lost
        logger.warning("Zookeeper session lost: {}".format(state))
    elif state == KazooState.SUSPENDED:
        # Handle being disconnected from Zookeeper
        logger.warning("Zookeeper session suspended: {}".format(state))
    else:
        # Handle being connected/reconnected to Zookeeper
        logger.info("Connected to zookeeper: {}".format(state))


# Connect to Zookeeper
try:
    logger.info("Connecting to zookeeper")
    zk = KazooClient(hosts='localhost:2181')
    zk.add_listener(_my_listener)
    zk.start()
except Exception as e:
    # Include the cause; the original format string had no placeholder.
    logger.error("Unable to start the connection to Zookeeper: {}".format(e))

# Create the base config in Zookeeper
try:
    zk.create("/zktesting")
    zk.create("/zktesting/traptor")
except Exception as e:
    logger.error("Unable to create the base Traptor config: {}".format(e))

zk.stop()
def connect(zk_quorum):
    """Start a Kazoo client against *zk_quorum* and return it.

    The ``connection_lost`` listener is registered after the client has
    been started.
    """
    logger.info('Connecting to zookeeper quorum at: {0}'.format(zk_quorum))
    client = KazooClient(hosts=zk_quorum)
    client.start()
    client.add_listener(connection_lost)
    return client
class DeploymentConfig(object):
    """ Accesses deployment configuration options. """
    # The ZooKeeper node where configuration is stored.
    CONFIG_ROOT = '/appscale/config'

    def __init__(self, hosts):
        """ Creates new DeploymentConfig object.

        Args:
          hosts: A list of ZooKeeper hosts.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        self.update_lock = Lock()
        self.state = ConfigStates.LOADING
        self.config = {}
        self.conn = KazooClient(hosts=hosts, read_only=True)
        self.conn.add_listener(self._conn_listener)
        self.conn.start()
        self.conn.ensure_path(self.CONFIG_ROOT)
        self.conn.ChildrenWatch(self.CONFIG_ROOT, func=self._update_config)

    def _conn_listener(self, state):
        """ Handles changes in ZooKeeper connection state.

        Args:
          state: A string indicating the new state.
        """
        # One message per state: a lost session must not also be reported
        # as established.
        if state == KazooState.LOST:
            self.logger.warning('ZK connection lost')
        elif state == KazooState.SUSPENDED:
            self.logger.warning('ZK connection suspended')
        else:
            self.logger.info('ZK connection established')

    def _load_child(self, child):
        """ Fetches the data for a configuration node.

        Args:
          child: A string containing the ZooKeeper node to fetch.
        Returns:
          A dictionary containing configuration data; empty when the node
          is missing or its content is not valid JSON.
        Raises:
          ConfigInaccessible if ZooKeeper is not accessible.
        """
        node = '/'.join([self.CONFIG_ROOT, child])
        try:
            data, _ = self.conn.retry(self.conn.get, node)
        except NoNodeError:
            # Must come first: NoNodeError is itself a ZookeeperError, so the
            # broader handler below would otherwise shadow it.
            return {}
        except (KazooException, ZookeeperError):
            raise ConfigInaccessible('ZooKeeper connection not available')

        try:
            return json.loads(data)
        except ValueError:
            self.logger.warning('Invalid deployment config: {}'.format(child))
            return {}

    def _update_config(self, children):
        """ Updates configuration when it changes.

        Args:
          children: A list of ZooKeeper nodes.
        """
        with self.update_lock:
            self.state = ConfigStates.LOADING

            # Ensure old sections are removed.
            self.config = {}

            for child in children:
                # Retry each section until ZooKeeper can be reached.
                while True:
                    try:
                        self.config[child] = self._load_child(child)
                        break
                    except ConfigInaccessible as load_error:
                        self.logger.warning(str(load_error))
                        time.sleep(SMALL_WAIT)

            self.logger.info('Deployment configuration updated')
            self.state = ConfigStates.LOADED

    def get_config(self, section):
        """ Fetches the configuration for a given section.

        Args:
          section: A string specifying the section to fetch.
        Returns:
          A dictionary containing configuration data.
        Raises:
          ConfigInaccessible if ZooKeeper is inaccessible.
        """
        # If the connection is established, it should finish loading very soon.
        while (self.state == ConfigStates.LOADING and
               self.conn.state not in (KazooState.LOST, KazooState.SUSPENDED)):
            time.sleep(TINY_WAIT)

        if self.state != ConfigStates.LOADED:
            raise ConfigInaccessible('ZooKeeper connection not available')

        with self.update_lock:
            return self.config.get(section, {})

    def close(self):
        """ Close the ZooKeeper connection. """
        self.conn.stop()
class ZookeeperProxy(object):
    """Thin wrapper around a KazooClient that stores JSON-serialised
    configuration dicts in znodes and exposes a state-change hook."""

    hook_points = ['kazoo_state_change']

    def __init__(self):
        self._zk = None
        self.logger = None
        self._root_path = None
        self._hooks = Hooks(ZookeeperProxy.hook_points)

    def listener(self, state):
        """Log the new Kazoo state and run the ``kazoo_state_change`` hook."""
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            self.logger.info("listener, KazooState.LOST")
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            self.logger.info("listener, KazooState.SUSPENDED")
        elif state == KazooState.CONNECTED:
            # Handle being connected/reconnected to Zookeeper
            self.logger.info("listener, KazooState.CONNECTED")
        else:
            self.logger.info("listener, KazooState unknown")

        self.hooks.run('kazoo_state_change', state)

    def connect(self, ip_address, port, root_path, logger):
        """Connect once to ``ip_address:port`` and ensure *root_path* exists.

        Does nothing when a client is already connected.
        """
        if not self._zk:
            # The listener logs through self.logger, so it must be set
            # before the listener can possibly fire.
            self.logger = logger
            # establish zookeeper connection; ``logger`` belongs to the
            # client, not to the host-string format call.
            self._zk = KazooClient(
                hosts='{0}:{1}'.format(ip_address, port), logger=logger)
            self._zk.start()
            self._zk.add_listener(self.listener)
            # Ensure a path, create if necessary
            self._root_path = root_path
            self._zk.ensure_path(self._root_path)
            # make sure finalize is called when stopping nio
            atexit.register(self.disconnect)

    def disconnect(self):
        """Stop the client if one exists; safe to call when never connected."""
        if self.logger is not None:
            self.logger.info("Disconnecting")
        if self._zk:
            self._zk.stop()
            self._zk = None

    def get_children(self, node_path):
        """Return the children of *node_path*, or None when it is missing."""
        try:
            children = self._zk.get_children(node_path)
            return children
        except NoNodeError:
            pass  # pragma: no cover
        return None

    def fetch(self, node_path):
        """Return the decoded JSON stored at *node_path*.

        A missing node yields ``{}``; falsy node data is returned unchanged.
        """
        try:
            data, stat = self._zk.get(node_path)
            if data:
                data = json.loads(data.decode())
        except NoNodeError:
            data = {}  # pragma: no cover
        return data

    def register(self, node_path, config):
        """Create *node_path* with *config*, overwriting it if it exists."""
        serialized_config = self._process_for_serialization(config)
        try:
            self._zk.create(node_path, serialized_config)
        except NodeExistsError:
            self._zk.set(node_path, serialized_config)

    def save(self, node_path, config):
        """Overwrite the data of the existing node *node_path*."""
        self._zk.set(node_path, self._process_for_serialization(config))

    def remove(self, node_path):
        """Delete *node_path* and everything below it."""
        self._zk.delete(node_path, recursive=True)

    @staticmethod
    def _process_for_serialization(config):
        """JSON-encode *config* to bytes, dropping keys starting with '_'."""
        data = {k: config[k] for k in config if not k.startswith('_')}
        return json.dumps(data).encode()

    def get_root_path(self):
        """Return the root path given to :meth:`connect` (None before)."""
        return self._root_path

    @property
    def hooks(self):
        """The ``Hooks`` registry created in ``__init__``."""
        return self._hooks
class ZkConfig:
    """Loads configuration stored as JSON in ZooKeeper nodes."""

    def __init__(self):
        """Connect to ``zk_hosts``; exit the process on connection timeout."""
        self.zk = KazooClient(hosts=zk_hosts)
        try:
            self.zk.start()
        except KazooTimeoutError as e:
            exit(e.args)
        finally:
            self.zk.add_listener(connection_listener)

    def _get_config(func):
        """Decorator: read and JSON-decode ``node_path_conf``, install
        logging watches, then call *func* with the decoded dict.

        :raises DpException: when the node is missing or is not valid JSON.
        """
        def wrapper(self, node_path_env: str, node_path_conf: str,
                    conf_dict: dict = None):
            stat = self.zk.exists(node_path_conf)
            if stat is None:
                err_msg = "%s is not exists" % node_path_conf
                logger.error(err_msg)
                raise DpException(
                    ErrorConstants.ec_sys_error,
                    ErrorConstants.error_code_message.get(
                        ErrorConstants.ec_sys_error) + err_msg)

            @self.zk.ChildrenWatch(node_path_env)
            def watch_children(children):
                logger.warning("Children of %s are now: %s\n",
                               node_path_env, children)

            @self.zk.DataWatch(node_path_conf)
            def watch_node(data, stat, event: WatchedEvent):
                # Both are None once the watched node has been deleted.
                logger.warning("Version: %s, data: %s, event is %s\n",
                               stat.version if stat is not None else None,
                               data.decode("utf-8") if data is not None else None,
                               event)

            data = self.zk.get(node_path_conf)
            conf_data = data[0].decode("utf-8")
            try:
                conf_dict = json.loads(conf_data)
            except ValueError:
                err_msg = "can't convert conf data to dict, conf_data is %s" % conf_data
                logger.error(err_msg)
                raise DpException(
                    ErrorConstants.ec_sys_error,
                    ErrorConstants.error_code_message.get(
                        ErrorConstants.ec_sys_error) + err_msg)
            # Pass the paths in the order the wrapped signature declares.
            return func(self, node_path_env, node_path_conf, conf_dict)

        return wrapper

    @_get_config
    def get_db_config(self, node_path_env: str, node_path_conf: str,
                      conf_dict: dict = None):
        """Copy host/user/passwd/db/port from *conf_dict* onto ``db_config``.

        :raises DpException: when *conf_dict* cannot be read as a mapping.
        """
        try:
            db_config.host = conf_dict.get("host")
            db_config.user = conf_dict.get("user")
            db_config.passwd = conf_dict.get("passwd")
            db_config.db = conf_dict.get("db")
            db_config.port = conf_dict.get("port")
        except Exception:
            err_msg = "param of mysql config is deficiency: %s" % conf_dict
            logger.error(err_msg)
            raise DpException(
                ErrorConstants.ec_sys_error,
                ErrorConstants.error_code_message.get(
                    ErrorConstants.ec_sys_error) + err_msg)
        else:
            logger.debug(db_config)
class PartitionClient(object):
    """
    Client Class for the Partition Library

    Partitions 0..max_partition-1 are distributed over the cluster nodes with
    a consistent hash; ownership of a partition is held through a ZooKeeper
    lock under "/lockpath/<app_name>/<partition>".

    Example usage:
    ---------------------
    import libpartition
    from libpartition.libpartition import PartitionClient

    def own_change_cb(l):
        print("ownership change:" + str(l))

    c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32,
            own_change_cb, "zookeeper_s1")

    ##do some real work now"
    if (c.own_partition(1)):
        ...... do something with partition #1 .....
        .........
    ...
    c.update_cluster_list(["s1", "s2"])
    ...
    ----------------------

    You should not call any partition library routine from within the
    callback function

    Args:
        app_name(str): Name of the app for which partition cluster is used
        self_name(str): Name of the local cluster node (can be ip address)
        cluster_list(list): List of all the nodes in the cluster including
            local node
        max_partition(int): Partition space always go from 0..max_partition-1
        partition_update_cb: Callback function invoked when partition
            ownership list is updated.
        zk_server(str): <zookeeper server>:<zookeeper server port>
        logger: Optional logger; the ``logging`` module is used when None.

    Raises:
        ValueError: if self_name is not part of cluster_list.
    """

    def __init__(self, app_name, self_name, cluster_list, max_partition,
                 partition_update_cb, zk_server, logger=None):

        # Initialize local variables
        self._zk_server = zk_server
        self._cluster_list = set(cluster_list)
        self._max_partition = max_partition
        self._update_cb = partition_update_cb
        self._curr_part_ownership_list = []
        self._target_part_ownership_list = []
        self._con_hash = ConsistentHash(cluster_list)
        self._name = self_name

        # some sanity check
        if self._name not in cluster_list:
            raise ValueError('cluster list is missing local server name')

        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        # connect to zookeeper; retry with a fresh client until the listener
        # has reported the connection as UP
        while True:
            self._logger.error("Libpartition zk start")
            self._zk = KazooClient(zk_server)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = ("Exception {0} in Libpartition zk stop/close."
                                " Args:\n{1!r}")
                    message = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" %
                                       (message, traceback.format_exc(),
                                        self._name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        # create a lock array to contain locks for each partition
        self._part_locks = [
            self._zk.Lock("/lockpath/" + app_name + "/" + str(part),
                          self._name)
            for part in range(0, self._max_partition)]

        # initialize partition # to lock acquire greenlet dictionary
        self._part_lock_task_dict = {}

        self._logger.error("initial servers:" + str(self._cluster_list))

        # update target partition ownership list
        self._target_part_ownership_list = self._compute_target_ownership()

        # update current ownership list
        self._acquire_partition_ownership()
    # end __init__

    def _compute_target_ownership(self):
        """Return the partitions the consistent hash assigns to this node."""
        return [part for part in range(0, self._max_partition)
                if self._con_hash.get_node(str(part)) == self._name]
    # end _compute_target_ownership

    def _sandesh_connection_info_update(self, status, message):
        """Publish the ZooKeeper connection state and remember it.

        Args:
            status(str): Name of a ConnectionStatus attribute
                ('INIT', 'UP', 'DOWN' are used in this class).
            message(str): Free-form detail passed on with the state.
        """
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper', status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        # log only on transitions into DOWN / into UP
        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Kazoo state listener; terminates the process when the session is LOST."""
        self._logger.error("Libpartition listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Libpartition connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Libpartition connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')
    # end _zk_listen

    # following routine is the greenlet task function to acquire the lock
    # for a partition
    def _acquire_lock(self, part):
        """Poll once per second until the lock of partition *part* is held.

        Returns:
            True once the lock was acquired (the partition is appended to the
            current ownership list and the update callback is invoked),
            False if the acquisition was cancelled.

        Raises:
            SystemExit: on any other exception.
        """
        # lock for the partition
        lock = self._part_locks[part]

        # go in an infinite loop waiting to acquire the lock
        try:
            while True:
                if lock.acquire(blocking=False):
                    self._logger.error("Acquired lock for:" + str(part))
                    self._curr_part_ownership_list.append(part)
                    self._update_cb(self._curr_part_ownership_list)
                    return True
                gevent.sleep(1)
        except CancelledError:
            self._logger.error("Lock acquire cancelled for:" + str(part))
            return False
        except Exception as ex:
            # TODO: If we have a non-KazooException, the lock object
            #       may get stuck in the "cancelled" state
            self._logger.error("Lock acquire unexpected error!: " + str(ex))
            # This exception should get propagated to main thread
            raise SystemExit(1)
    # end _acquire_lock

    # get rid of finished spawned tasks from datastructures
    def _cleanup_greenlets(self):
        """Drop every finished lock-acquisition greenlet from the task dict."""
        # Snapshot the keys: entries are deleted inside the loop, and
        # mutating a dict while iterating it raises RuntimeError.
        for part in list(self._part_lock_task_dict):
            if self._part_lock_task_dict[part].ready():
                del self._part_lock_task_dict[part]
    # end _cleanup_greenlets

    # following routine launches tasks to acquire partition locks
    def _acquire_partition_ownership(self):
        """Reconcile current ownership with the target ownership list.

        Spawns a greenlet for every target partition not yet owned, cancels
        pending acquisitions and releases locks for partitions that are no
        longer targeted, and invokes the update callback if ownership shrank.
        """
        # cleanup any finished greenlets
        self._cleanup_greenlets()

        # this variable will help us decide if we need to call callback
        updated_curr_ownership = False

        # list of partitions for which locks have to be released
        release_lock_list = []

        self._logger.info("known servers: %s" %
                          self._con_hash.get_all_nodes())

        for part in range(0, self._max_partition):
            if part in self._target_part_ownership_list:
                if part in self._curr_part_ownership_list:
                    # do nothing, I already have ownership of this partition
                    self._logger.info("No need to acquire ownership of:" +
                                      str(part))
                else:
                    # I need to acquire lock for this partition before I own
                    if part in self._part_lock_task_dict:
                        try:
                            self._part_lock_task_dict[part].get(block=False)
                        except:  # noqa: E722 - gevent.Timeout derives from
                            # BaseException, so this must stay a bare except.
                            # do nothing there is already a greenlet running
                            # to acquire the lock
                            self._logger.error("Already a greenlet running to"
                                               " acquire:" + str(part))
                            continue

                        # Greenlet died without getting ownership. Cleanup
                        self._logger.error("Cleanup stale greenlet running to"
                                           " acquire:" + str(part))
                        del self._part_lock_task_dict[part]

                    self._logger.error("Starting greenlet running to"
                                       " acquire:" + str(part))
                    # launch the greenlet to acquire the lock
                    g = Greenlet.spawn(self._acquire_lock, part)
                    self._part_lock_task_dict[part] = g
            else:
                # give up ownership of the partition

                # cancel any lock acquisition which is ongoing
                if part in self._part_lock_task_dict:
                    try:
                        self._part_lock_task_dict[part].get(block=False)
                    except:  # noqa: E722 - see note on gevent.Timeout above
                        self._logger.error(
                            "canceling lock acquisition going on for:" +
                            str(part))
                        # Cancelling the lock should result in killing the
                        # gevent
                        self._part_locks[part].cancel()
                        self._part_lock_task_dict[part].get(block=True)

                    del self._part_lock_task_dict[part]

                if part in self._curr_part_ownership_list:
                    release_lock_list.append(part)
                    self._curr_part_ownership_list.remove(part)
                    updated_curr_ownership = True
                    self._logger.error("giving up ownership of:" + str(part))

        if updated_curr_ownership:
            # current partition membership was updated call the callback
            self._update_cb(self._curr_part_ownership_list)

        # release locks which were acquired
        for part in release_lock_list:
            self._logger.error("release the lock which was acquired:" +
                               str(part))
            try:
                self._part_locks[part].release()
                self._logger.error("fully gave up ownership of:" + str(part))
            except Exception as ex:
                # best effort: keep releasing the remaining locks, but do not
                # hide the failure
                self._logger.error("failed to release lock for:" + str(part) +
                                   " : " + str(ex))
    # end _acquire_partition_ownership

    def update_cluster_list(self, cluster_list):
        """ Updates the cluster node list
        Args:
            cluster_list(list): New list of names of the nodes in
                the cluster
        Returns:
            None
        Raises:
            ValueError: if the local node name is not in cluster_list
        """
        # some sanity check
        if self._name not in cluster_list:
            raise ValueError('cluster list is missing local server name')

        new_cluster_list = set(cluster_list)
        new_servers = list(new_cluster_list.difference(
            self._cluster_list))
        deleted_servers = list(set(self._cluster_list).difference(
            new_cluster_list))
        self._cluster_list = set(cluster_list)

        # update the hash structure
        if new_servers:
            self._logger.error("new servers:" + str(new_servers))
            self._con_hash.add_nodes(new_servers)
        if deleted_servers:
            self._logger.error("deleted servers:" + str(deleted_servers))
            self._con_hash.del_nodes(deleted_servers)

        # update target partition ownership list
        self._target_part_ownership_list = self._compute_target_ownership()

        # update current ownership list
        self._acquire_partition_ownership()
    # end update_cluster_list

    def own_partition(self, part_no):
        """ Returns ownership information of a partition
        Args:
            part_no(int) : Partition no
        Returns:
            True if partition is owned by the local node
            False if partition is not owned by the local node
        """
        return part_no in self._curr_part_ownership_list
    # end own_partition

    def close(self):
        """ Closes any connections and frees up any data structures
        Args:
        Returns:
            None
        """
        # clean up greenlets
        for part, task in list(self._part_lock_task_dict.items()):
            try:
                self._logger.error("libpartition greenlet cleanup %s" %
                                   str(part))
                task.kill()
            except Exception as ex:
                self._logger.error("libpartition greenlet cleanup failed"
                                   " for %s : %s" % (str(part), str(ex)))

        self._zk.remove_listener(self._zk_listen)
        gevent.sleep(1)
        self._logger.error("Stopping libpartition")
        # close zookeeper
        try:
            self._zk.stop()
        except Exception:
            self._logger.error("Stopping libpartition failed")
        else:
            self._logger.error("Stopping libpartition successful")

        self._logger.error("Closing libpartition")
        try:
            self._zk.close()
        except Exception:
            self._logger.error("Closing libpartition failed")
        else:
            self._logger.error("Closing libpartition successful")
    # end close
PUT_SUCCESS = 0 PUT_ERROR = -1 PUT_PROP_SUCCESS = 0 PUT_PROP_ERROR = -1 DELETE_SUCCESS = 0 DELETE_ERROR = -1 DELETE_PROP_SUCCESS = 0 DELETE_PROP_ERROR = -1 DUMP_SUCCESS = 0 DUMP_ERROR = -1 zk_host = "127.0.0.1" zk_port = 2181 zk = KazooClient(hosts=(zk_host + ":" + str(zk_port))) zk.start() zk.add_listener(zk_state_listener) host = "" port = -1 GroupId = -1 ServerId = -1 peer_infos = [] group_infos = {} model = None hash_table = None class serverRPC: def get(self, key): try: return model.get(key)
class ZooKeeper(object):
    '''
    Facade over the kazoo client for node requests, nodes and node locks.

    Callers normally go through the methods below; the underlying kazoo
    connection stays reachable as ``self.client`` for the cases the facade
    does not cover.
    '''

    log = logging.getLogger("zuul.zk.ZooKeeper")

    REQUEST_ROOT = '/nodepool/requests'
    NODE_ROOT = '/nodepool/nodes'

    # Minimum number of seconds between two "retrying" log messages
    retry_log_rate = 10

    def __init__(self):
        '''
        Start without a connection; see connect().
        '''
        self.client = None
        self._became_lost = False
        self._last_retry_log = 0

    def _dictToStr(self, data):
        # JSON text, encoded as UTF-8 bytes
        serialized = json.dumps(data)
        return serialized.encode('utf8')

    def _strToDict(self, data):
        # inverse of _dictToStr
        text = data.decode('utf8')
        return json.loads(text)

    def _connection_listener(self, state):
        '''
        React to a kazoo connection state change.

        .. warning:: This method must not block.
        '''
        if state == KazooState.LOST:
            self.log.debug("ZooKeeper connection: LOST")
            self._became_lost = True
            return
        if state == KazooState.SUSPENDED:
            self.log.debug("ZooKeeper connection: SUSPENDED")
            return
        self.log.debug("ZooKeeper connection: CONNECTED")

    @property
    def connected(self):
        current = self.client.state
        return current == KazooState.CONNECTED

    @property
    def suspended(self):
        current = self.client.state
        return current == KazooState.SUSPENDED

    @property
    def lost(self):
        current = self.client.state
        return current == KazooState.LOST

    @property
    def didLoseConnection(self):
        return self._became_lost

    def resetLostFlag(self):
        self._became_lost = False

    def logConnectionRetryEvent(self):
        # rate-limit the warning to one per retry_log_rate seconds
        now = time.monotonic()
        elapsed = now - self._last_retry_log
        if elapsed < self.retry_log_rate:
            return
        self.log.warning("Retrying zookeeper connection")
        self._last_retry_log = now

    def connect(self, hosts, read_only=False, timeout=10.0):
        '''
        Connect to the ZooKeeper cluster unless a client already exists.

        The initial connection is retried until it succeeds.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        :param bool read_only: If True, establishes a read-only connection.
        :param float timeout: The ZooKeeper session timeout, in
            seconds (default: 10.0).
        '''
        if self.client is not None:
            return

        self.client = KazooClient(hosts=hosts, read_only=read_only,
                                  timeout=timeout)
        self.client.add_listener(self._connection_listener)

        # Manually retry initial connection attempt
        started = False
        while not started:
            try:
                self.client.start(1)
            except KazooTimeoutError:
                self.logConnectionRetryEvent()
            else:
                started = True

    def disconnect(self):
        '''
        Stop and close the client if it is currently connected.

        Call this when the connection was established with connect().
        '''
        client = self.client
        if client is None or not client.connected:
            return
        client.stop()
        client.close()
        self.client = None

    def resetHosts(self, hosts):
        '''
        Replace the host list of an existing client.

        :param str hosts: Comma-separated list of hosts to connect to (e.g.
            127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
        '''
        if self.client is None:
            return
        self.client.set_hosts(hosts=hosts)

    def submitNodeRequest(self, node_request, watcher):
        '''
        Create the request znode and watch it for updates.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.

        :param callable watcher: Invoked on every update of the request as
            ``watcher(node_request, deleted)``; ``deleted`` is True when the
            watch reports no data object at all.  Its return value is handed
            back to the data watch.
        '''
        payload = node_request.toDict()
        payload['created_time'] = time.time()

        prefix = '%s/%s-' % (self.REQUEST_ROOT, node_request.priority)
        path = self.client.create(prefix, self._dictToStr(payload),
                                  makepath=True,
                                  sequence=True, ephemeral=True)
        node_request.id = path.split("/")[-1]

        def callback(data, stat):
            deleted = data is None
            if data:
                update = self._strToDict(data)
                request_nodes = list(node_request.nodeset.getNodes())
                for idx, nodeid in enumerate(update.get('nodes', [])):
                    raw_node, _node_stat = self.client.get(
                        '%s/%s' % (self.NODE_ROOT, nodeid))
                    node_info = self._strToDict(raw_node)
                    target = request_nodes[idx]
                    target.id = nodeid
                    target.updateFromDict(node_info)
                node_request.updateFromDict(update)
            return watcher(node_request, deleted)

        self.client.DataWatch(path, callback)

    def deleteNodeRequest(self, node_request):
        '''
        Delete a request for nodes; a missing znode is ignored.

        :param NodeRequest node_request: A NodeRequest with the
            contents of the request.
        '''
        request_path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        try:
            self.client.delete(request_path)
        except kze.NoNodeError:
            pass

    def nodeRequestExists(self, node_request):
        '''
        See if a NodeRequest exists in ZooKeeper.

        :param NodeRequest node_request: A NodeRequest to verify.

        :returns: True if the request exists, False otherwise.
        '''
        request_path = '%s/%s' % (self.REQUEST_ROOT, node_request.id)
        return bool(self.client.exists(request_path))

    def storeNode(self, node):
        '''Overwrite the stored node with the node's current dict form.

        :param Node node: The node to update.
        '''
        serialized = self._dictToStr(node.toDict())
        self.client.set('%s/%s' % (self.NODE_ROOT, node.id), serialized)

    def lockNode(self, node, blocking=True, timeout=None):
        '''
        Acquire the lock of a node and store it on ``node.lock``.

        :param Node node: The node which should be locked.

        :raises LockException: on timeout, or when the lock was not obtained.
        '''
        lock_path = '%s/%s/lock' % (self.NODE_ROOT, node.id)
        try:
            node_lock = Lock(self.client, lock_path)
            acquired = node_lock.acquire(blocking, timeout)
        except kze.LockTimeout:
            raise LockException(
                "Timeout trying to acquire lock %s" % lock_path)

        # If we aren't blocking, it's possible we didn't get the lock
        # because someone else has it.
        if not acquired:
            raise LockException("Did not get lock on %s" % lock_path)

        node.lock = node_lock

    def unlockNode(self, node):
        '''
        Release the lock stored on the node.

        :param Node node: The node which should be unlocked.

        :raises LockException: if ``node.lock`` is None.
        '''
        held = node.lock
        if held is None:
            raise LockException("Node %s does not hold a lock" % (node,))
        held.release()
        node.lock = None

    def heldNodeCount(self, autohold_key):
        '''
        Count the number of nodes being held for the given tenant/project/job.

        :param set autohold_key: A set with the tenant/project/job names.
        '''
        identifier = " ".join(autohold_key)
        try:
            node_ids = self.client.get_children(self.NODE_ROOT)
        except kze.NoNodeError:
            return 0

        held = 0
        for nodeid in node_ids:
            raw, _stat = self.client.get('%s/%s' % (self.NODE_ROOT, nodeid))
            if not raw:
                self.log.warning("Node ID %s has no data", nodeid)
                continue
            info = self._strToDict(raw)
            if info['state'] != zuul.model.STATE_HOLD:
                continue
            if info.get('hold_job') != identifier:
                continue
            held += 1
        return held
class ZookeeperServiceRegistry(BaseServiceRegistry):
    """Service registry backed by ZooKeeper.

    Service instances are stored as JSON in ephemeral znodes at
    ``<chroot>/services/<service_type>/<identity>``.  The kazoo client runs on
    a ``SequentialGeventHandler``.
    """

    def __init__(self, hosts=DEFAULT_HOSTS, chroot=DEFAULT_CHROOT):
        super(ZookeeperServiceRegistry, self).__init__()
        self.chroot = chroot
        self.client = KazooClient(
            hosts=hosts,
            handler=SequentialGeventHandler(),
        )
        self.client.add_listener(self.on_kazoo_state_change)
        # number of on_start() calls not yet balanced by on_stop()
        self.start_count = 0

    @classmethod
    def from_config(cls, config, **kwargs):
        """Build a registry from a config mapping (keys: hosts, chroot)."""
        return cls(
            hosts=config.get('hosts', DEFAULT_HOSTS),
            chroot=config.get('chroot', DEFAULT_CHROOT),
            **kwargs)

    def on_start(self, timeout=10):
        """Connect on the first call; later calls only bump the counter.

        Raises:
            RuntimeError: if the client is not connected after *timeout*.
        """
        self.start_count += 1
        if self.start_count > 1:
            return
        started = self.client.start_async()
        started.wait(timeout=timeout)
        if not self.client.connected:
            raise RuntimeError('could not connect to zookeeper')
        logger.debug('connected to zookeeper (version=%s)',
                     '.'.join(map(str, self.client.server_version())))

    def on_stop(self):
        """Stop the client once every on_start() has been balanced."""
        self.start_count -= 1
        if self.start_count != 0:
            return
        self.client.stop()

    def on_kazoo_state_change(self, state):
        logger.info('kazoo connection state changed to %s', state)

    def on_service_type_watch(self, service, event):
        """Children watch callback: re-run lookup() when children changed."""
        try:
            if event.type == EventType.CHILD:
                # FIXME: figure out proper retry strategy
                self.client.retry(self.lookup, service.container, service)
        except Exception:
            logger.exception('error in service type watcher')

    def on_service_watch(self, service, event):
        """Node watch callback: drop the instance whose znode was deleted."""
        try:
            prefix, service_type, identity = event.path.rsplit('/', 2)
            if event.type == EventType.DELETED:
                service.remove(identity)
        except Exception:
            logger.exception('error in service watcher')

    def _get_service_znode(self, service, service_type, identity):
        """Read one instance znode (installing a watch) as a str->str dict."""
        path = self._get_zk_path(service_type, identity)
        result = self.client.get_async(
            path, watch=functools.partial(self.on_service_watch, service))
        value, znode = result.get()
        items = six.iteritems(json.loads(value.decode('utf-8')))
        return {str(k): str(v) for k, v in items}

    def discover(self, container):
        """Return the children of ``<chroot>/services`` as a list."""
        result = self.client.get_children_async(
            path='%s/services' % self.chroot,
        )
        return list(result.get())

    def lookup(self, container, service, watch=True, timeout=1):
        """Synchronise *service* with the instances registered in ZooKeeper.

        Every child of the service-type node is read and passed to
        ``service.update``; identities known to *service* but no longer
        present are removed.  A children watch is installed on the
        service-type node.

        Returns:
            The *service* object that was passed in.

        Raises:
            LookupFailure: if the service-type node does not exist.
        """
        service_type = service.service_type
        result = self.client.get_children_async(
            path='%s/services/%s' % (self.chroot, service_type),
            watch=functools.partial(self.on_service_type_watch, service),
        )
        try:
            names = result.get(timeout=timeout)
        except NoNodeError:
            raise LookupFailure(
                None, "failed to resolve %s" % service.service_type)
        logger.info("lookup %s %r", service_type, names)
        stale = set(service.identities())
        for name in names:
            kwargs = self._get_service_znode(service, service_type, name)
            identity = kwargs.pop('identity')
            service.update(identity, **kwargs)
            stale.discard(identity)
        for identity in stale:
            service.remove(identity)
        return service

    def _get_zk_path(self, service_type, identity):
        return '%s/services/%s/%s' % (self.chroot, service_type, identity)

    def register(self, container, service_type, timeout=1):
        """Create the ephemeral znode describing *container*."""
        path = self._get_zk_path(service_type, container.identity)
        value = json.dumps({
            'endpoint': container.endpoint,
            'identity': container.identity,
            'log_endpoint': container.log_endpoint,
        })
        result = self.client.create_async(
            path,
            value.encode('utf-8'),
            ephemeral=True, makepath=True)
        # FIXME: result.set_exception(RegistrationFailure())
        result.get(timeout=timeout)

    def unregister(self, container, service_type, timeout=1):
        """Delete the znode describing *container*.

        Raises:
            RegistrationFailure: if the delete operation fails.
        """
        path = self._get_zk_path(service_type, container.identity)
        result = self.client.delete_async(path)
        # Do not pre-set an exception on the async result: that makes get()
        # raise even when the delete succeeds.  Translate real failures only.
        try:
            result.get(timeout=timeout)
        except Exception:
            raise RegistrationFailure()
class ZookeeperRegistry(Registry):
    """Registry implementation that keeps its data in ZooKeeper."""

    _app_config = ApplicationConfig('default_app')
    _connect_state = 'UNCONNECT'

    def __init__(self, zk_hosts, application_config=None):
        if application_config:
            self._app_config = application_config
        self.__zk = KazooClient(hosts=zk_hosts)
        self.__zk.add_listener(self.__state_listener)
        self.__zk.start()

    def __state_listener(self, state):
        # LOST, SUSPENDED and (re)connected are all handled the same way:
        # the latest state is simply recorded.
        self._connect_state = state

    def __unquote(self, origin_nodes):
        # lazily URL-unquote every non-empty child node name
        return (urllib.parse.unquote(node)
                for node in origin_nodes
                if node)

    def _do_event(self, event):
        """Re-read the children of the event path and swap them in.

        event.path looks like /dubbo/com.ofpay.demo.api.UserProvider/providers;
        the leading "/dubbo/" and the trailing "/providers" are cut off to
        obtain the provider name.
        """
        last_slash = event.path.rfind('/')
        provide_name = event.path[7:last_slash]
        if event.state in ('CONNECTED', 'DELETED'):
            children = self.__zk.get_children(event.path,
                                              watch=self.event_listener)
            self._compare_swap_nodes(provide_name, self.__unquote(children))

    def register(self, interface, **kwargs):
        """Create the ephemeral consumer node for *interface* if it is missing."""
        ip = self.__zk._connection._socket.getsockname()[0]
        app = self._app_config
        params = {
            'interface': interface,
            'application': app.name,
            'application.version': app.version,
            'category': 'consumer',
            'dubbo': 'dubbo-client-py-1.0.0',
            'environment': app.environment,
            'method': '',
            'owner': app.owner,
            'side': 'consumer',
            'pid': os.getpid(),
            'version': '1.0'
        }
        query = urllib.parse.urlencode(params)
        url = 'consumer://{0}/{1}?{2}'.format(ip, interface, query)

        consumer_path = '{0}/{1}/{2}'.format('dubbo', interface, 'consumers')
        self.__zk.ensure_path(consumer_path)

        node_path = consumer_path + '/' + urllib.parse.quote(url, safe='')
        if self.__zk.exists(node_path):
            return
        self.__zk.create(node_path, ephemeral=True)

    def subscribe(self, interface, **kwargs):
        """
        Watch the registry for providers of a service going up or down.

        :param interface: service name such as com.ofpay.demo.api.UserProvider
        :return: nothing
        """
        providers_path = '{0}/{1}/{2}'.format('dubbo', interface, 'providers')
        children = self.__zk.get_children(providers_path,
                                          watch=self.event_listener)
        # add everything again from scratch
        self._compare_swap_nodes(interface, self.__unquote(children))
class BaseZKgRPC():
    """Base class for gRPC clients that find their endpoint through ZooKeeper.

    Subclasses provide:
      * ``STUB_CLASS`` - dotted path of the gRPC stub class,
      * ``METHODS``    - iterable of ``(method_name, dotted_request_class)``,
      * ``ZK_KEY``     - znode whose children hold JSON with ``host``/``port``.

    For every entry of ``METHODS`` an instance attribute of that name is
    created which forwards its keyword arguments to ``call_method``.
    """

    ZK_ENDPOINT = '127.0.0.1:2181'
    CA_FILE = False
    CLIENT_CERT = False
    CLIENT_KEY = False

    def __init__(self):
        """Constructor"""
        self.zk_connected = False
        self.zk = KazooClient(hosts=self.ZK_ENDPOINT, read_only=True)
        # register the listener before connecting so that no state change
        # between start() and registration can be missed
        self.zk.add_listener(self.kazoo_listener)
        # immediately connect to zookeeper
        self.zk.start()
        self.zk_connected = True

        # register deconstructor to run on exit
        atexit.register(self.__del__)

        self.channel = False
        self.stub = False

        # determine stub class and import
        stub_package, stub_class = self.STUB_CLASS.rsplit('.', 1)
        self.stub_class = getattr(
            importlib.import_module(stub_package), stub_class)

        # do the same for all methods and generate instance methods
        for method_name, request_class_package in self.METHODS:
            request_package, request_class_name = \
                request_class_package.rsplit('.', 1)
            request_class = getattr(
                importlib.import_module(request_package), request_class_name)

            # default arguments bind the current loop values to the lambda
            setattr(
                self, method_name,
                lambda method_name=method_name, request_class=request_class,
                **v: self.call_method(method_name, request_class, **v))

    def kazoo_listener(self, state):
        """Zookeeper state change handler: flags whether zk is usable"""
        if state == KazooState.LOST:
            LOGGER.info("Kazoo session lost")
            self.zk_connected = False
        elif state == KazooState.SUSPENDED:
            LOGGER.info("Kazoo disconnected")
            self.zk_connected = False
        else:
            LOGGER.info("Kazoo connected")
            self.zk_connected = True

    def connect(self):
        """Open a gRPC channel (TLS if CA_FILE is set) and create the stub"""
        # Pick the endpoint once: get_endpoint() chooses randomly, so calling
        # it again could dial a different host than the one logged.
        endpoint = self.get_endpoint()
        LOGGER.info('Using endpoint: {}'.format(endpoint))

        # make TLS connection if given a root certificate
        if self.CA_FILE:
            self.channel = grpc.secure_channel(endpoint,
                                               self.get_credentials())
        else:
            self.channel = grpc.insecure_channel(endpoint)
        self.stub = self.stub_class(self.channel)

    def get_endpoint(self):
        """Return a random "host:port" read from the children of ZK_KEY.

        Blocks, polling once per second, until Zookeeper is flagged as
        connected and at least one child is present.
        """
        hosts = []

        # loop until at least one host is returned
        while not hosts:
            try:
                # wait until zookeeper is flagged as available
                while not self.zk_connected:
                    LOGGER.info("Waiting for Zookeeper connection")
                    time.sleep(1)

                # read the JSON payload of every child of the root key
                brokers = [
                    json.loads(
                        self.zk.get('{}/{}'.format(self.ZK_KEY, node))[0])
                    for node in self.zk.get_children(self.ZK_KEY)
                ]

                # build endpoints from returned json
                hosts = ['%s:%d' % (b['host'], b['port']) for b in brokers]

                # wait until at least one host is returned
                if not hosts:
                    LOGGER.info(
                        "Waiting for hosts to be available in {}".format(
                            self.ZK_KEY))
                    time.sleep(1)

            # handle connection issues and try again
            except (NoNodeError, ConnectionLoss):
                time.sleep(1)

        # choose a random host
        return random.choice(hosts)

    def get_credentials(self):
        """Creates TLS credentials from the CA_FILE root certificate"""
        with open(self.CA_FILE, 'rb') as f:
            ca_trust = f.read()
        return grpc.ssl_channel_credentials(root_certificates=ca_trust)

    def call_method(self, method_name, request_class, *args, **kwargs):
        """Build ``request_class(**kwargs)`` and send it via the stub.

        On UNAVAILABLE or INTERNAL status the client reconnects and retries;
        any other RPC error is re-raised.

        NOTE(review): the retry loop has no upper bound or back-off of its
        own; it relies on connect()/get_endpoint() blocking.
        """
        # ensure we're connected
        if not self.channel or not self.stub:
            self.connect()

        # create a valid request message
        request = request_class(**kwargs)

        # reconnect automatically
        while True:
            try:
                return getattr(self.stub, method_name)(request)
            except grpc.RpcError as ex:
                # grpc.RpcError is the public base of the errors raised by
                # stub calls; not every RpcError necessarily exposes code()
                code = ex.code() if callable(getattr(ex, 'code', None)) \
                    else None
                if code not in (grpc.StatusCode.UNAVAILABLE,
                                grpc.StatusCode.INTERNAL):
                    raise
                LOGGER.info('Reconnecting...')
                self.connect()

    def __del__(self):
        """Ends zookeeper session and closes connection"""
        # self.zk is missing when the constructor failed before creating it
        zk = getattr(self, 'zk', None)
        if zk is None:
            return
        zk.stop()
        zk.close()
while success != True: try: redis_connection.slaveof(host=tHost, port=tPort) success = True except redis.ConnectionError: if timeout <= 0: raise OSError, "Timeout reached. Couldn't connect to Redis." print "Can't connect to Redis. Sleeping for %d seconds..." % retry_time sys.stdout.flush() timeout += retry_time sleep(retry_time) def touch(fname, times=None): with open(fname, 'a'): os.utime(fname, times) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) print "Redis %s %s" % (redis_host, redis_port) sys.stdout.flush() redis_connection = redis.StrictRedis(host=redis_host, port=redis_port, db=0) zk_connection = KazooClient(hosts=zk_hosts) zk_connection.add_listener(zookeeper_listener) zk_connection.start() start_election_and_take_position()
class AnalyticsDiscovery(gevent.Greenlet):
    """Publish this service instance in ZooKeeper and mirror watched services.

    The greenlet maintains an ephemeral znode
    ``<basepath>/<svc_name>/<inst>`` holding the last payload given to
    :meth:`publish`.  For every key of ``watchers`` it mirrors the children of
    ``<basepath>/<key>`` into ``self._wchildren[key]`` and passes the sorted
    child payloads to the callback registered for that key.
    """

    def _sandesh_connection_info_update(self, status, message):
        """Record a ZooKeeper connection status and log DOWN/UP transitions.

        Args:
            status: attribute name on ``ConnectionStatus`` (the code uses
                'INIT', 'UP' and 'DOWN').
            message: free text forwarded with the status.
        """
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name,
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        # Only log when a known, non-DOWN state turns into DOWN.
        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        # Only log when a known state turns into UP.
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        """Kazoo connection listener: track UP/DOWN and request a re-publish."""
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            if self._conn_state != ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='UP', message='')
                self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
                # The main loop in _run re-publishes and re-reads children.
                self._reconnect = True
            else:
                self._logger.error("Analytics Discovery already connected")
        else:
            self._logger.error("Analytics Discovery NOT connected")
            if self._conn_state == ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='DOWN', message='')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        """DataWatch callback for one child znode of a watched service.

        Stores the decoded payload (or drops the child when ``data`` is empty)
        and marks the watcher as having a pending callback.
        """
        self._logger.error(
            "Analytics Discovery %s ChildData : child %s, data %s, event %s" %
            (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        """ChildrenWatch callback: any membership change forces a full resync."""
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst, watchers=None,
                 zpostfix="", freq=10):
        """Set up state and the (not yet started) ZooKeeper client.

        Args:
            logger: logger to use; ``None`` falls back to the ``logging``
                module after ``logging.basicConfig()``.
            zkservers: comma-separated ZooKeeper server list.
            svc_name: name of the service being published.
            inst: instance identifier; last component of the published znode.
            watchers: mapping of watched service name to a callback (or a
                falsy value for "mirror only").  ``None`` means no watchers.
            zpostfix: suffix appended to "/analytics-discovery-" to form the
                base path.
            freq: seconds to sleep between iterations of the main loop.
        """
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zk = KazooClient(hosts=zkservers)
        self._pubinfo = None
        # A fresh dict per instance; the former `watchers={}` default was a
        # single dict object shared by every instance created without one.
        self._watchers = {} if watchers is None else watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        """Publish, update or withdraw this instance's znode.

        Args:
            pubinfo: payload to store; ``None`` deletes the znode if present.

        Any exception is logged, the connection is marked DOWN and a
        reconnect pass is requested; nothing is raised to the caller.
        """
        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                svc_path = self._basepath + "/" + self._svc_name
                inst_path = "%s/%s" % (svc_path, self._inst)
                self._logger.error("ensure %s" % (svc_path))
                self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(svc_path)
                self._logger.error("check for %s/%s/%s" %
                                   (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists(inst_path):
                        self._zk.set(inst_path, self._pubinfo)
                    else:
                        self._zk.create(inst_path, self._pubinfo, ephemeral=True)
                else:
                    if self._zk.exists(inst_path):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete(inst_path)
            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" %
                                   (messag, traceback.format_exc(), self._svc_name,
                                    str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")

    def _run(self):
        """Greenlet body: connect, install watches, then loop forever.

        Each loop iteration either delivers pending watcher callbacks or,
        when a reconnect was requested, re-publishes and re-reads every
        watched service.  ``GreenletExit`` stops the client and ends the
        loop; an exception outside the inner loop ends with ``SystemExit``.
        """
        # Retry start() once a second until it succeeds.
        while True:
            try:
                self._zk.start()
                break
            except gevent.event.Timeout as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
            # Zookeeper is also throwing exception due to delay in master election
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper
            self._zk.add_listener(self._zk_listen)
            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        # Swap the pending set out before invoking callbacks
                        # so additions made meanwhile are kept for next time.
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](
                                    sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect"
                                           % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath + "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(
                                        self._basepath + "/" + wk + "/" + elem,
                                        partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(
                                    self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                    OrderedDict(sorted(data_dict.items()))

                                self._logger.error(
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" %
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" %
                                       self._svc_name)
                    self._zk.stop()
                    break
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" %
                                       (messag, traceback.format_exc(), self._svc_name,
                                        str(self._pubinfo)))
                    # Retry the whole resync on the next iteration.
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" %
                               (messag, traceback.format_exc(), self._svc_name,
                                str(self._pubinfo)))
            raise SystemExit
class _ZookeeperProxy(object):
    """Thin wrapper around a KazooClient whose host list is refreshed lazily.

    Host addresses come from ``address_provider`` through a
    ``SlowlyUpdatedCache``; most operations touch the cache first and then
    run the matching client call through ``client.retry``.
    """

    def __init__(self, address_provider: AddressListProvider, prefix: str):
        """Store collaborators; the client itself is created on first host update.

        Args:
            address_provider: source of the latest ZooKeeper addresses.
            prefix: string appended to the connection string.
        """
        self.address_provider = address_provider
        self.async_counter = WaitingCounter(limit=100)
        self.conn_str = None
        self.client = None
        self.prefix = prefix
        refresh_seconds = 30  # Refresh every 30 seconds
        stability_seconds = 3 * 60  # Update only after 180 seconds of stability
        self.hosts_cache = SlowlyUpdatedCache(
            self.address_provider.get_latest_address,
            self._update_hosts,
            refresh_seconds,
            stability_seconds)

    def _update_hosts(self, value):
        """Cache callback: rebuild the connection string and (re)start the client.

        Args:
            value: ``(hosts, port)`` pair; an empty ``hosts`` is ignored.
        """
        hosts, port = value
        if not hosts:
            return
        endpoints = ['{}:{}'.format(host, port) for host in hosts]
        self.conn_str = ','.join(endpoints) + self.prefix
        if self.client is not None:
            # Existing client: stop it and point it at the new hosts.
            self.client.stop()
            self.client.set_hosts(self.conn_str)
        else:
            retry_commands = {'deadline': 10, 'max_delay': 1, 'max_tries': -1}
            retry_connection = {'max_delay': 1, 'max_tries': -1}
            self.client = KazooClient(hosts=self.conn_str,
                                      command_retry=retry_commands,
                                      connection_retry=retry_connection)
            self.client.add_listener(self.session_listener)
        self.client.start()

    def terminate(self):
        """Stop the client if one was ever created."""
        if self.client:
            self.client.stop()

    def session_listener(self, state):
        """Connection-state listener; intentionally does nothing."""
        pass

    def get_conn_str(self):
        """Return the current connection string (``None`` until first update)."""
        return self.conn_str

    def get(self, *params):
        """Touch the hosts cache, then run ``client.get`` with retries."""
        self.hosts_cache.touch()
        return self.client.retry(self.client.get, *params)

    def get_async(self, *params):
        """Start an async get, tracking it in ``async_counter`` until it completes."""
        # Exhibitor is not polled here and it's totally fine!
        self.async_counter.increment()
        try:
            pending = self.client.get_async(*params)
            # The counter is released when the async result is linked back.
            pending.rawlink(self._decrement)
        except Exception as e:
            # The request never got registered, so release the slot here.
            self._decrement()
            raise e
        return pending

    def _decrement(self, *args, **kwargs):
        """Release one slot of ``async_counter``; arguments are ignored."""
        self.async_counter.decrement()

    def set(self, *args, **kwargs):
        """Touch the hosts cache, then run ``client.set`` with retries."""
        self.hosts_cache.touch()
        return self.client.retry(self.client.set, *args, **kwargs)

    def create(self, *args, **kwargs):
        """Touch the hosts cache, then run ``client.create`` with retries."""
        self.hosts_cache.touch()
        return self.client.retry(self.client.create, *args, **kwargs)

    def delete(self, *args, **kwargs):
        """Run ``client.delete`` with retries; a missing node yields ``None``."""
        self.hosts_cache.touch()
        try:
            return self.client.retry(self.client.delete, *args, **kwargs)
        except NoNodeError:
            return None

    def get_children(self, *params):
        """Run ``client.get_children`` with retries; a missing node yields ``[]``."""
        self.hosts_cache.touch()
        try:
            children = self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            children = []
        return children

    def take_lock(self, *args, **kwargs):
        """Return ``client.Lock(*args, **kwargs)``, retrying on any exception.

        Failures are logged and the attempt is repeated immediately.
        """
        while True:
            try:
                self.hosts_cache.touch()
                return self.client.Lock(*args, **kwargs)
            except Exception as e:
                _LOG.error('Failed to obtain lock for exhibitor, retrying', exc_info=e)
class ZooKeeper(object):
    """View of Solr cluster state kept current through ZooKeeper watches.

    Watches update ``collections``, ``liveNodes`` and ``aliases``; the
    ``get*`` methods answer from those cached structures only.
    """

    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = "/live_nodes"
    ALIASES = "/aliases.json"
    CLUSTER_STATE = "/clusterstate.json"
    COLLECTION_STATUS = "/collections"
    COLLECTION_STATE = "/collections/%s/state.json"
    SHARDS = "shards"
    REPLICAS = "replicas"
    STATE = "state"
    ACTIVE = "active"
    LEADER = "leader"
    BASE_URL = "base_url"
    TRUE = "true"
    FALSE = "false"
    COLLECTION = "collection"

    def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None):
        """Connect and install the watches that keep the cached state fresh.

        Args:
            zkServerAddress: hosts string for a newly created client.
            timeout: timeout passed to a newly created client.
            max_retries: ``max_tries`` for command and connection retries of
                a newly created client.
            kazoo_client: existing client to use instead of creating one.

        Raises:
            RuntimeError: if ``KazooClient`` is ``None`` (kazoo not installed).
        """
        if KazooClient is None:
            logging.error(
                "ZooKeeper requires the `kazoo` library to be installed")
            raise RuntimeError

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(
                zkServerAddress,
                read_only=True,
                timeout=timeout,
                command_retry={"max_tries": max_retries},
                connection_retry={"max_tries": max_retries},
            )
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            # Only LOST and SUSPENDED are recorded; other states are ignored.
            if state in (KazooState.LOST, KazooState.SUSPENDED):
                self.state = state

        self.zk.add_listener(connectionListener)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if not data:
                logger.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode("utf-8"))
                logger.info("Updated collections: %s", self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            logger.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode("utf-8"))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    logger.warning(
                        "Expected to find %s in alias update %s",
                        ZooKeeper.COLLECTION,
                        json_data.keys(),
                    )
            else:
                self.aliases = None
            logger.info("Updated aliases: %s", self.aliases)

        def watchCollectionState(data, *args, **kwargs):
            if not data:
                logger.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections.update(json.loads(data.decode("utf-8")))
                logger.info("Updated collections: %s", self.collections)

        # Collections that already have a state watch.  The children callback
        # receives the complete child list on every change, so without this
        # guard each change would add another DataWatch per collection and
        # every state update would be processed once per accumulated watch.
        watched_collections = set()

        @self.zk.ChildrenWatch(ZooKeeper.COLLECTION_STATUS)
        def watchCollectionStatus(children):
            logger.info("Updated collection: %s", children)
            for c in children:
                if c in watched_collections:
                    continue
                watched_collections.add(c)
                self.zk.DataWatch(self.COLLECTION_STATE % c, watchCollectionState)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """Return the distinct base URLs of active replicas in active shards.

        Args:
            collname: collection or alias name.
            only_leader: keep only replicas whose leader flag is "true".
            seen_aliases: aliases already expanded (used to stop cycles).

        Raises:
            SolrError: if ``collname`` is neither an alias nor a known collection.
        """
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        hosts = []
        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)
        shards = self.collections[collname][ZooKeeper.SHARDS]
        for shard in shards.values():
            if shard[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                continue
            for replica in shard[ZooKeeper.REPLICAS].values():
                if replica[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                    continue
                if only_leader and replica.get(ZooKeeper.LEADER, None) != ZooKeeper.TRUE:
                    continue
                base_url = replica[ZooKeeper.BASE_URL]
                if base_url not in hosts:
                    hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        """Expand an alias (comma-separated names) into distinct host URLs.

        An alias that is already in ``seen_aliases`` is logged and yields ``[]``.
        """
        if seen_aliases:
            if collname in seen_aliases:
                logger.warning("%s in circular alias definition - ignored",
                               collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)
        hosts = []
        for collection in self.aliases[collname].split(","):
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        """Return ``"<host>/<collname>"`` for a randomly chosen eligible host.

        Raises:
            SolrError: if no host is eligible.
        """
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError("ZooKeeper returned no active shards!")
        return "%s/%s" % (random.choice(hosts), collname)  # NOQA: B311

    def getLeaderURL(self, collname):
        """Return a URL for a randomly chosen leader replica of ``collname``."""
        return self.getRandomURL(collname, only_leader=True)
class Coordinator(object):
    """Leader election and follower tracking for senders, backed by ZooKeeper.

    Leadership is the kazoo lock at '/iris/sender_leader'; membership is the
    kazoo party at '/iris/sender_nodes'.  Both use "<hostname>:<port>" as the
    identifier.
    """

    def __init__(self, zk_hosts, hostname, port, join_cluster):
        """Connect to ZooKeeper and, when asked, join the sender party.

        Args:
            zk_hosts: hosts string for the kazoo client.
            hostname: this node's host name.
            port: this node's port.
            join_cluster: when falsy the client is opened read-only and the
                node neither listens for state changes nor joins the party.
        """
        self.me = '%s:%s' % (hostname, port)
        self.is_leader = None
        self.followers = cycle([])
        self.follower_count = 0
        self.started_shutdown = False

        self.zk = KazooClient(hosts=zk_hosts,
                              handler=SequentialGeventHandler(),
                              read_only=not join_cluster)
        # Wait at most five seconds for the asynchronous start to finish.
        started = self.zk.start_async()
        started.wait(timeout=5)

        self.lock = self.zk.Lock(path='/iris/sender_leader', identifier=self.me)

        # Used to keep track of followers / senders present in cluster
        self.party = Party(client=self.zk, path='/iris/sender_nodes',
                           identifier=self.me)

        if join_cluster:
            self.zk.add_listener(self.event_listener)
            self.party.join()

    def am_i_leader(self):
        """Return the cached leadership flag (True, False or None)."""
        return self.is_leader

    # Used for API to get the current leader
    def get_current_leader(self):
        """Return (host, port) of the first lock contender, or None."""
        try:
            contenders = self.lock.contenders()
        except kazoo.exceptions.KazooException:
            logger.exception('Failed getting contenders')
            return None

        if not contenders:
            return None
        return self.address_to_tuple(contenders[0])

    # Used for API to get the current followers if leader can't be reached
    def get_current_followers(self):
        """Return (host, port) for every member of the party."""
        return [self.address_to_tuple(member) for member in self.party]

    def address_to_tuple(self, address):
        """Split "host:port" into (host, int(port)); log and return None on failure."""
        try:
            host, port = address.split(':')
            return host, int(port)
        except (IndexError, ValueError):
            logger.error('Failed getting address tuple from %s', address)
            return None

    def update_status(self):
        """Refresh ``is_leader``, ``followers`` and ``follower_count``.

        Does nothing once shutdown has started.
        """
        if self.started_shutdown:
            return

        # Step 1: work out whether we hold (or can take) the leader lock.
        if self.zk.state != KazooState.CONNECTED:
            logger.error('ZK connection is in %s state', self.zk.state)
            self.is_leader = False
        elif self.lock.is_acquired:
            self.is_leader = True
        else:
            try:
                self.is_leader = self.lock.acquire(blocking=False, timeout=2)
            # This one is expected when we're recovering from ZK being down
            except kazoo.exceptions.CancelledError:
                self.is_leader = False
            except kazoo.exceptions.LockTimeout:
                self.is_leader = False
                logger.exception('Failed trying to acquire lock (shouldn\'t happen as we\'re using nonblocking locks)')
            except kazoo.exceptions.KazooException:
                self.is_leader = False
                logger.exception('ZK problem while Failed trying to acquire lock')

        # Step 2: refresh the follower list; the state is read again because
        # it may have changed while the lock was being acquired.
        if self.zk.state != KazooState.CONNECTED:
            self.followers = cycle([])
            self.follower_count = 0
            return

        if self.is_leader:
            others = [self.address_to_tuple(member)
                      for member in self.party if member != self.me]
            self.follower_count = len(others)
            self.followers = cycle(others)
        else:
            self.followers = cycle([])
            self.follower_count = 0

        # Keep us as part of the party, so the current leader sees us as a follower
        # NOTE(review): nesting reconstructed from whitespace-flattened source;
        # this re-join is assumed to run for leaders as well -- confirm.
        if not self.party.participating:
            try:
                self.party.join()
            except kazoo.exceptions.KazooException:
                logger.exception('ZK problem while trying to join party')

    def update_forever(self):
        """Poll ``update_status`` and emit metrics until shutdown starts."""
        while not self.started_shutdown:
            previous = self.is_leader
            self.update_status()
            current = self.is_leader

            # Role changes are logged at info level, steady state at debug.
            log = logger.info if previous != current else logger.debug

            if self.is_leader:
                log('I am the leader sender')
            else:
                log('I am a follower sender')

            metrics.set('follower_instance_count', self.follower_count)
            metrics.set('is_leader_sender', int(self.is_leader is True))

            sleep(UPDATE_FREQUENCY)

    def leave_cluster(self):
        """Leave the party, release the lock and stop reporting leadership."""
        self.started_shutdown = True

        # cancel any attempts to acquire leader lock which could make us hang
        self.lock.cancel()

        if self.zk.state == KazooState.CONNECTED:
            if self.party and self.party.participating:
                logger.info('Leaving party')
                self.party.leave()
            if self.lock and self.lock.is_acquired:
                logger.info('Releasing lock')
                self.lock.release()

        # Make us not the leader
        self.is_leader = False

        # Avoid sending metrics that we are still the leader when we're not
        metrics.set('is_leader_sender', 0)

    def event_listener(self, state):
        """Kazoo state listener: forget leadership/membership on LOST or SUSPENDED."""
        if state not in (KazooState.LOST, KazooState.SUSPENDED):
            return

        logger.info('ZK state transitioned to %s. Resetting leader status.', state)

        # cancel pending attempts to acquire lock which will break and leave
        # us in bad state
        self.lock.cancel()

        # make us try to re-acquire lock during next iteration when we're connected
        if self.lock.is_acquired:
            self.lock.is_acquired = False

        # make us try to rejoin the party during next iteration when we're connected
        if self.party.participating:
            self.party.participating = False

        # in the meantime we're not leader
        self.is_leader = None
def open_connection():
    """Create the module-level ZooKeeper client ``zk`` and start it.

    The client is built for ``CONNECT_STRING`` with a timeout of 50,
    ``my_listener`` is registered for connection-state changes, and
    ``start`` is given up to 150 seconds.
    """
    global zk
    zk = KazooClient(timeout=50, hosts=CONNECT_STRING)
    # Attach the listener before starting so the first transition is seen.
    zk.add_listener(my_listener)
    zk.start(timeout=150)
class ZooKeeper(object):
    """View of Solr cluster state kept current through ZooKeeper watches.

    Watches update ``collections``, ``liveNodes`` and ``aliases``; the
    ``get*`` methods answer from those cached structures only.
    """

    # Constants used by the REST API:
    LIVE_NODES_ZKNODE = '/live_nodes'
    ALIASES = '/aliases.json'
    CLUSTER_STATE = '/clusterstate.json'
    SHARDS = 'shards'
    REPLICAS = 'replicas'
    STATE = 'state'
    ACTIVE = 'active'
    LEADER = 'leader'
    BASE_URL = 'base_url'
    TRUE = 'true'
    FALSE = 'false'
    COLLECTION = 'collection'

    def __init__(self, zkServerAddress, timeout=15, max_retries=-1, kazoo_client=None):
        """Connect and install the watches that keep the cached state fresh.

        Args:
            zkServerAddress: hosts string for a newly created client.
            timeout: timeout passed to a newly created client.
            max_retries: ``max_tries`` for command and connection retries of
                a newly created client.
            kazoo_client: existing client to use instead of creating one.

        Raises:
            RuntimeError: if ``KazooClient`` is ``None`` (kazoo not installed).
        """
        if KazooClient is None:
            logging.error(
                'ZooKeeper requires the `kazoo` library to be installed')
            raise RuntimeError

        self.collections = {}
        self.liveNodes = {}
        self.aliases = {}
        self.state = None

        if kazoo_client is None:
            self.zk = KazooClient(zkServerAddress, read_only=True,
                                  timeout=timeout,
                                  command_retry={'max_tries': max_retries},
                                  connection_retry={'max_tries': max_retries})
        else:
            self.zk = kazoo_client

        self.zk.start()

        def connectionListener(state):
            # Only LOST and SUSPENDED are recorded; other states are ignored.
            if state == KazooState.LOST or state == KazooState.SUSPENDED:
                self.state = state

        self.zk.add_listener(connectionListener)

        @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE)
        def watchClusterState(data, *args, **kwargs):
            if not data:
                LOG.warning(
                    "No cluster state available: no collections defined?")
            else:
                self.collections = json.loads(data.decode('utf-8'))
                LOG.info('Updated collections: %s', self.collections)

        @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE)
        def watchLiveNodes(children):
            self.liveNodes = children
            LOG.info("Updated live nodes: %s", children)

        @self.zk.DataWatch(ZooKeeper.ALIASES)
        def watchAliases(data, stat):
            if data:
                json_data = json.loads(data.decode('utf-8'))
                if ZooKeeper.COLLECTION in json_data:
                    self.aliases = json_data[ZooKeeper.COLLECTION]
                else:
                    LOG.warning('Expected to find %s in alias update %s',
                                ZooKeeper.COLLECTION, json_data.keys())
            else:
                self.aliases = None
            LOG.info("Updated aliases: %s", self.aliases)

    def getHosts(self, collname, only_leader=False, seen_aliases=None):
        """Return the distinct base URLs of active replicas in active shards.

        Args:
            collname: collection or alias name.
            only_leader: keep only replicas whose leader flag is 'true'.
            seen_aliases: aliases already expanded (used to stop cycles).

        Raises:
            SolrError: if ``collname`` is neither an alias nor a known collection.
        """
        if self.aliases and collname in self.aliases:
            return self.getAliasHosts(collname, only_leader, seen_aliases)

        hosts = []
        if collname not in self.collections:
            raise SolrError("Unknown collection: %s" % collname)
        collection = self.collections[collname]
        for shard in collection[ZooKeeper.SHARDS].values():
            if shard[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                continue
            for replica in shard[ZooKeeper.REPLICAS].values():
                if replica[ZooKeeper.STATE] != ZooKeeper.ACTIVE:
                    continue
                is_leader = replica.get(ZooKeeper.LEADER, None) == ZooKeeper.TRUE
                if only_leader and not is_leader:
                    continue
                base_url = replica[ZooKeeper.BASE_URL]
                if base_url not in hosts:
                    hosts.append(base_url)
        return hosts

    def getAliasHosts(self, collname, only_leader, seen_aliases):
        """Expand an alias (comma-separated names) into distinct host URLs.

        An alias that is already in ``seen_aliases`` is logged and yields ``[]``.
        """
        if seen_aliases:
            if collname in seen_aliases:
                # Logger.warn is a deprecated alias that newer Python
                # versions no longer provide; warning() is the supported call.
                LOG.warning("%s in circular alias definition - ignored",
                            collname)
                return []
        else:
            seen_aliases = []
        seen_aliases.append(collname)
        collections = self.aliases[collname].split(",")
        hosts = []
        for collection in collections:
            for host in self.getHosts(collection, only_leader, seen_aliases):
                if host not in hosts:
                    hosts.append(host)
        return hosts

    def getRandomURL(self, collname, only_leader=False):
        """Return ``'<host>/<collname>'`` for a randomly chosen eligible host.

        Raises:
            SolrError: if no host is eligible.
        """
        hosts = self.getHosts(collname, only_leader=only_leader)
        if not hosts:
            raise SolrError('ZooKeeper returned no active shards!')
        return '%s/%s' % (random.choice(hosts), collname)

    def getLeaderURL(self, collname):
        """Return a URL for a randomly chosen leader replica of ``collname``."""
        return self.getRandomURL(collname, only_leader=True)
return str(value.decode()) def to_response(string): return string.content.decode('utf-8').strip("\"") def hash_function(string): return sum(map(ord, list(string))) try: logging.basicConfig() zkr = KazooRetry(max_tries=-1) client = KazooClient(hosts="127.0.0.1:2181", connection_retry=zkr) client.add_listener(zk_status_listener) client.start() if client.exists("/servers/master"): @client.ChildrenWatch("/servers/") def become_master(children): if "master" not in children: instances = client.get_children("/servers/slaves/") if instances[0].split("_")[-1] == PORT: print(">>> This server is the new master") client.create("/servers/master", ephemeral=True) client.set("/servers/master", PORT.encode()) if client.exists("/servers/status"): client.delete("/servers/status")
class ConsistentScheduler(object):
    '''
        Hides workers, work items and their mapping to partitions, so an
        application only deals with the work items it currently owns.
        Partitions are shared out through a kazoo SetPartitioner; the
        delete/add handlers let the application clean up before partitions
        are given up.
    '''
    _MAX_WAIT_4_ALLOCATION = 6 + randint(0, 9)

    def __init__(self, service_name=None, zookeeper='127.0.0.1:2181',
                 delete_hndlr=None, add_hndlr=None, bucketsize=47,
                 item2part_func=None, partitioner=None, logger=None,
                 cluster_id=''):
        '''
            Connect to ZooKeeper (retrying once a second until start()
            succeeds) and create the set partitioner.

            service_name:   name used in the ZooKeeper path; defaults to the
                            basename of sys.argv[0]
            zookeeper:      comma-separated server list
            delete_hndlr:   called with the items of partitions being given up
            add_hndlr:      called with the items of partitions about to be
                            gained
            bucketsize:     number of partitions
            item2part_func: maps an item name to a partition number
            partitioner:    partition function handed to SetPartitioner
            logger:         logger to use; defaults to getLogger(__name__)
            cluster_id:     optional prefix for the ZooKeeper path
        '''
        if logger:
            self._logger = logger
        else:
            self._logger = logging.getLogger(__name__)
        self._service_name = service_name or os.path.basename(sys.argv[0])
        self._item2part_func = item2part_func or self._device2partition
        self._zookeeper_srvr = zookeeper
        self._bucketsize = bucketsize
        self._delete_hndlr = delete_hndlr
        self._add_hndlr = add_hndlr
        self._partitioner = partitioner or self._partitioner_func
        self._partitions = {}
        self._con_hash = None
        self._last_log = ''
        self._last_log_cnt = 0
        # A real list: the set is handed to SetPartitioner and read again in
        # _release, and a bare map() can only be iterated once on Python 3.
        self._partition_set = [str(p) for p in range(self._bucketsize)]

        self._cluster_id = cluster_id
        if self._cluster_id:
            self._zk_path = '/' + self._cluster_id + '/contrail_cs' + '/' + self._service_name
        else:
            self._zk_path = '/'.join(['/contrail_cs', self._service_name])
        self._zk = KazooClient(self._zookeeper_srvr,
                               handler=SequentialGeventHandler())
        self._zk.add_listener(self._zk_lstnr)
        self._conn_state = None
        while True:
            try:
                self._zk.start()
                break
            except gevent.event.Timeout as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
            # Zookeeper is also throwing exception due to delay in master election
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)
        self._pc = self._zk.SetPartitioner(path=self._zk_path,
                                           set=self._partition_set,
                                           partition_func=self._partitioner)
        self._wait_allocation = 0
        gevent.sleep(0)

    def _sandesh_connection_info_update(self, status, message):
        '''Record a ZooKeeper connection status and log DOWN/UP transitions.'''
        from pysandesh.connection_info import ConnectionState
        from pysandesh.gen_py.process_info.ttypes import ConnectionStatus, \
            ConnectionType

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper', status=new_conn_state,
                               message=message,
                               server_addrs=self._zookeeper_srvr.split(','))

        if ((self._conn_state and self._conn_state != ConnectionStatus.DOWN) and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._supress_log(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._supress_log(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_lstnr(self, state):
        '''Kazoo state listener; a LOST session terminates the process.'''
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(
                status='UP', message='Connection to Zookeeper established')
        elif state == KazooState.LOST:
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def schedule(self, items, lock_timeout=30):
        '''
            Advance the partitioner state machine by one step.

            Returns True only when partitions are acquired, in which case the
            owned work items have been recomputed from `items`.  Exits the
            process when the partitioner failed or allocation took more than
            _MAX_WAIT_4_ALLOCATION tries.
        '''
        gevent.sleep(0)
        ret = False
        if self._pc.failed:
            self._logger.error('Lost or unable to acquire partition')
            os._exit(2)
        elif self._pc.release:
            self._supress_log('Releasing...')
            self._release()
        elif self._pc.allocating:
            self._supress_log('Waiting for allocation...')
            self._pc.wait_for_acquire(lock_timeout)
            if self._wait_allocation < self._MAX_WAIT_4_ALLOCATION:
                self._wait_allocation += 1
            else:
                self._logger.error('Giving up after %d tries!' %
                                   (self._wait_allocation))
                os._exit(2)
        elif self._pc.acquired:
            self._supress_log('got work: ', list(self._pc))
            ret = True
            self._wait_allocation = 0
            self._populate_work_items(items)
            self._supress_log('work items: ',
                              self._items2name(self.work_items()),
                              'from the list',
                              self._items2name(items))
        return ret

    def members(self):
        '''Return the nodes currently in the consistent hash.'''
        return list(self._con_hash.nodes)

    def partitions(self):
        '''Return the partitions currently held by this worker.'''
        return list(self._pc)

    def work_items(self):
        '''Return all owned work items as one flat list.'''
        return sum(self._partitions.values(), [])

    def finish(self):
        '''Report every owned item as deleted and finish the partitioner.'''
        self._inform_delete(self._partitions.keys())
        self._pc.finish()

    def _items2name(self, items):
        '''Return the `name` of every item as a list.'''
        return [x.name for x in items]

    def _supress_log(self, *s):
        '''
            Debug-log the space-joined arguments, collapsing repeats.

            A message equal to the previous one is only counted; the next
            different message is prefixed with the repeat count.
        '''
        slog = ' '.join(map(str, s))
        dl = ''
        # Compare against the previous message.  This used to compare against
        # the integer repeat counter, which never equals a string, so
        # repeated messages were never suppressed.
        if slog != self._last_log:
            if self._last_log_cnt:
                dl += ' ' * 4
                dl += '.' * 8
                dl += '[last print repeats %d times]' % self._last_log_cnt
                self._last_log_cnt = 0
            dl += slog
            self._last_log = slog
            self._logger.debug(dl)
        else:
            self._last_log_cnt += 1

    def _consistent_hash(self, members):
        '''Return the consistent hash, synchronised with `members`.'''
        if self._con_hash is None:
            self._con_hash = ConsistentHash(members)
            self._logger.error('members: %s' % (str(self._con_hash.nodes)))
        cur, updtd = set(self._con_hash.nodes), set(members)
        if cur != updtd:
            newm = updtd - cur
            rmvd = cur - updtd
            if newm:
                self._logger.error('new members: %s' % (str(newm)))
                self._con_hash.add_nodes(list(newm))
            if rmvd:
                self._logger.error('members left: %s' % (str(rmvd)))
                self._con_hash.del_nodes(list(rmvd))
        return self._con_hash

    def _consistent_hash_get_node(self, members, partition):
        '''Return the member that the consistent hash assigns `partition` to.'''
        return self._consistent_hash(members).get_node(partition)

    def _partitioner_func(self, identifier, members, _partitions):
        '''Default partition function: keep partitions hashed to `identifier`.'''
        partitions = [p for p in _partitions
                      if self._consistent_hash_get_node(members, p) == identifier]
        self._logger.error('partitions: %s' % (str(partitions)))
        return partitions

    def _release(self):
        '''Tell the handlers what changes, then release the partition set.'''
        old = set(self._pc)
        new = set(self._partitioner(self._pc._identifier,
                                    list(self._pc._party),
                                    self._partition_set))
        rmvd = old - new
        added = new - old
        if rmvd:
            self._inform_delete(list(rmvd))
        if added:
            self._inform_will_add(list(added))
        self._pc.release_set()

    def _list_items_in(self, partitions):
        '''Return the items of the given partitions; unknown ones are skipped.'''
        return sum([self._partitions[k] for k in partitions
                    if k in self._partitions], [])

    def _inform_will_add(self, partitions):
        '''Call the add handler, if callable, with the partitions' items.'''
        if callable(self._add_hndlr):
            self._add_hndlr(self._list_items_in(partitions))

    def _inform_delete(self, partitions):
        '''Call the delete handler, if callable, with the partitions' items.'''
        if callable(self._delete_hndlr):
            self._delete_hndlr(self._list_items_in(partitions))

    def _populate_work_items(self, items):
        '''Rebuild the per-partition item lists from `items` for owned partitions.'''
        self._refresh_work_items()
        for i in items:
            part = str(self._item2part_func(i.name))
            if part in list(self._pc):
                if part not in self._partitions:
                    self._partitions[part] = []
                # Items are de-duplicated by name within a partition.
                if i.name not in [x.name for x in self._partitions[part]]:
                    self._partitions[part].append(i)
        self._logger.debug('@populate_work_items(%s): done!' % ' '.join(
            str(k) + ':' + ','.join(x.name for x in v)
            for k, v in self._partitions.items()))
        gevent.sleep(0)

    def _device2partition(self, key):
        '''Default item-to-partition map: md5 of the name modulo bucketsize.'''
        # hashlib needs bytes; text names are encoded first so this also
        # works where str is not bytes.
        if not isinstance(key, bytes):
            key = key.encode('utf-8')
        return struct.unpack(
            'Q', hashlib.md5(key).digest()[-8:])[0] % self._bucketsize

    def _refresh_work_items(self):
        '''Empty every known partition's item list, keeping the keys.'''
        for k in self._partitions:
            self._partitions[k] = []
class USSMetadataManager(object):
    """Interfaces with the locking system to get, put, and delete USS metadata.

    Metadata gets/stores/deletes the USS information for a particular grid,
    including current version number, a list of USSs with active operations,
    and the endpoints to get that information. Locking is assured through a
    snapshot token received when getting, and used when putting.
    """

    # Preset JSend messages for the failure codes this class produces; any
    # other code falls back to the "unknown" message.
    _FAILURE_MESSAGES = {
        400: 'Parameters are not following the correct format.',
        404: 'Unable to pull metadata from lock system.',
        408: 'Timeout trying to get lock.',
        409: 'Content in metadata has been updated since provided sync token.',
        424: 'Content in metadata is not following JSON format guidelines.',
    }

    def __init__(self, connectionstring=DEFAULT_CONNECTION, testgroupid=None):
        """Initializes the class.

        Args:
          connectionstring:
            Zookeeper connection string - server:port,server:port,...
          testgroupid: ID to use if in test mode, none for normal mode

        Raises:
          KazooTimeoutError: if the zookeeper connection times out
          ValueError: if the connection string contains a forbidden character
        """
        if testgroupid:
            self.set_testmode(testgroupid)
        if not connectionstring:
            connectionstring = DEFAULT_CONNECTION
        log.debug(
            'Creating metadata manager object and connecting to zookeeper...')
        try:
            if set(BAD_CHARACTER_CHECK) & set(connectionstring):
                raise ValueError
            self.zk = KazooClient(hosts=connectionstring,
                                  timeout=CONNECTION_TIMEOUT)
            self.zk.add_listener(self.zookeeper_connection_listener)
            self.zk.start()
            if testgroupid:
                self.delete_testdata(testgroupid)
        except KazooTimeoutError:
            log.error(
                'Unable to connect to zookeeper using %s connection string...',
                connectionstring)
            raise
        except ValueError:
            log.error('Connection string %s seems invalid...',
                      connectionstring)
            raise

    def __del__(self):
        log.debug(
            'Destroying metadata manager object and disconnecting from zk...')
        # __init__ can raise before self.zk is assigned (invalid connection
        # string), so the attribute is not guaranteed to exist here.
        zk = getattr(self, 'zk', None)
        if zk is not None:
            zk.stop()

    def get_state(self):
        """Returns the current state of the zookeeper client."""
        return self.zk.state

    def get_version(self):
        """Queries the zookeeper server version.

        Returns:
          (True, version) on success, or (False, description) where
          description is the exception class name followed by its message.
        """
        try:
            return True, self.zk.server_version()
        except KazooException as e:
            msg = str(e)
            return False, type(e).__name__ + (' ' + msg if msg else '')

    def set_verbose(self):
        """Switches the module logger to DEBUG level."""
        log.setLevel(logging.DEBUG)

    def set_testmode(self, testgroupid='UNDEFINED_TESTER'):
        """Sets the mode to testing with the specific test ID, cannot be undone.

        Note that this rebinds the module-level GRID_PATH and
        CONNECTION_TIMEOUT, so it affects every manager in the process.

        Args:
          testgroupid: ID to use if in test mode, none for normal mode
        """
        global GRID_PATH
        global CONNECTION_TIMEOUT
        # Adjust parameters specifically for the test
        GRID_PATH = TEST_BASE_PREFIX + testgroupid + USS_BASE_PREFIX
        log.debug('Setting test path to %s...', GRID_PATH)
        CONNECTION_TIMEOUT = 1.0

    def zookeeper_connection_listener(self, state):
        """Logs zookeeper connection state transitions."""
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            log.error('Lost connection with the zookeeper servers...')
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            log.error('Suspended connection with the zookeeper servers...')
        elif state == KazooState.CONNECTED:
            # Handle being connected/reconnected to Zookeeper
            log.info('Connection restored with the zookeeper servers...')

    def delete_testdata(self, testgroupid=None):
        """Removes the test data from the servers.

        Be careful when using this in parallel as it removes everything under
        the testgroupid, or everything if no testgroupid is provided.

        Args:
          testgroupid: ID to use if in test mode, none will remove all test
            data
        """
        if testgroupid:
            path = TEST_BASE_PREFIX + testgroupid
        else:
            path = TEST_BASE_PREFIX
        self.zk.delete(path, recursive=True)

    def get(self, z, x, y):
        """Gets the metadata and snapshot token for a GridCell.

        Reads data from zookeeper, including a snapshot token. The snapshot
        token is used as a reference when writing to ensure the data has not
        been updated between read and write.

        Args:
          z: zoom level in slippy tile format
          x: x tile number in slippy tile format
          y: y tile number in slippy tile format
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        # TODO(hikevin): Change to use our own error codes and let the server
        #                   convert them to http error codes. For now, this is
        #                   at least in a standard JSend format.
        if not slippy_util.validate_slippy(z, x, y):
            return self._format_status_code_to_jsend(400)
        (content, metadata) = self._get_raw(z, x, y)
        if not metadata:
            return self._format_status_code_to_jsend(404)
        try:
            m = uss_metadata.USSMetadata(content)
            return {
                'status': 'success',
                'sync_token': metadata.last_modified_transaction_id,
                'data': m.to_json()
            }
        except ValueError:
            return self._format_status_code_to_jsend(424)

    def set(self, z, x, y, sync_token, uss_id, ws_scope, operation_format,
            operation_ws, earliest_operation, latest_operation):
        """Sets the metadata for a GridCell.

        Writes data, using the snapshot token for confirming data
        has not been updated since it was last read.

        Args:
          z: zoom level in slippy tile format
          x: x tile number in slippy tile format
          y: y tile number in slippy tile format
          sync_token: token retrieved in the original GET GridCellMetadata,
          uss_id: plain text identifier for the USS,
          ws_scope: scope to use to obtain OAuth token,
          operation_format: output format for operation ws (i.e. NASA, GUTMA),
          operation_ws: submitting USS endpoint where all flights in
            this cell can be retrieved from,
          earliest_operation: lower bound of active or planned flight
            timestamp, used for quick filtering conflicts.
          latest_operation: upper bound of active or planned flight timestamp,
            used for quick filtering conflicts.
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        if not slippy_util.validate_slippy(z, x, y):
            return self._format_status_code_to_jsend(400)
        # first we have to get the cell
        (content, metadata) = self._get_raw(z, x, y)
        if not metadata:
            return self._format_status_code_to_jsend(404)
        # Quick check of the token, another is done on the actual set to be
        # sure but this check fails early and fast
        if str(metadata.last_modified_transaction_id) != str(sync_token):
            return self._format_status_code_to_jsend(409)
        try:
            m = uss_metadata.USSMetadata(content)
            log.debug('Setting metadata for %s...', uss_id)
            if not m.upsert_operator(
                    uss_id, ws_scope, operation_format, operation_ws,
                    earliest_operation, latest_operation, z, x, y):
                log.error(
                    'Failed setting operator for %s with token %s...',
                    uss_id, str(sync_token))
                raise ValueError
            status = self._set_raw(z, x, y, m, metadata.version)
        except ValueError:
            status = 424
        if status == 200:
            # Success, now get the metadata back to send back
            return self.get(z, x, y)
        return self._format_status_code_to_jsend(status)

    def delete(self, z, x, y, uss_id):
        """Sets the metadata for a GridCell by removing the entry for the USS.

        Args:
          z: zoom level in slippy tile format
          x: x tile number in slippy tile format
          y: y tile number in slippy tile format
          uss_id: is the plain text identifier for the USS
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        if not slippy_util.validate_slippy(z, x, y):
            return self._format_status_code_to_jsend(400)
        # first we have to get the cell
        (content, metadata) = self._get_raw(z, x, y)
        if not metadata:
            return self._format_status_code_to_jsend(404)
        try:
            m = uss_metadata.USSMetadata(content)
            m.remove_operator(uss_id)
            # TODO(pelletierb): Automatically retry on delete
            status = self._set_raw(z, x, y, m, metadata.version)
        except ValueError:
            status = 424
        if status != 200:
            return self._format_status_code_to_jsend(status)
        # Success, now get the metadata back to send back. Only the fresh
        # sync token is taken from the re-read; the data returned is the
        # in-memory object that was just written.
        (content, metadata) = self._get_raw(z, x, y)
        return {
            'status': 'success',
            'sync_token': metadata.last_modified_transaction_id,
            'data': m.to_json()
        }

    def get_multi(self, z, grids):
        """Gets the metadata and snapshot token for multiple GridCells.

        Reads data from zookeeper, including a composite snapshot token. The
        snapshot token is used as a reference when writing to ensure the data
        has not been updated between read and write.

        Args:
          z: zoom level in slippy tile format
          grids: list of (x,y) tiles to retrieve
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        try:
            combined_meta, syncs = self._get_multi_raw(z, grids)
            composite_token = self._hash_sync_tokens(syncs)
            log.debug('Found sync token %s for %d grids...',
                      composite_token, len(syncs))
            result = {
                'status': 'success',
                'sync_token': composite_token,
                'data': combined_meta.to_json()
            }
        except ValueError as e:
            result = self._format_status_code_to_jsend(
                400, self._exception_message(e))
        except IndexError as e:
            result = self._format_status_code_to_jsend(
                404, self._exception_message(e))
        return result

    def set_multi(self, z, grids, sync_token, uss_id, ws_scope,
                  operation_format, operation_ws, earliest_operation,
                  latest_operation):
        """Sets multiple GridCells metadata at once.

        Writes data, using the hashed snapshot token for confirming data
        has not been updated since it was last read.

        Args:
          z: zoom level in slippy tile format
          grids: list of (x,y) tiles to update
          sync_token: token retrieved in the original get_multi,
          uss_id: plain text identifier for the USS,
          ws_scope: scope to use to obtain OAuth token,
          operation_format: output format for operation ws (i.e. NASA, GUTMA),
          operation_ws: submitting USS endpoint where all flights in
            this cell can be retrieved from,
          earliest_operation: lower bound of active or planned flight
            timestamp, used for quick filtering conflicts.
          latest_operation: upper bound of active or planned flight timestamp,
            used for quick filtering conflicts.
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        log.debug('Setting multiple grid metadata for %s...', uss_id)
        try:
            # first, get the affected grid's sync tokens (the combined
            # metadata itself is not needed for the check)
            _, syncs = self._get_multi_raw(z, grids)
            # Quick check of the token, another is done on the actual set to
            # be sure but this check fails early and fast
            composite_token = self._hash_sync_tokens(syncs)
            log.debug('Found sync token %d for %d grids...',
                      composite_token, len(syncs))
            if str(composite_token) != str(sync_token):
                raise KeyError('Composite sync_token has changed')
            log.debug('Composite sync_token matches, continuing...')
            self._set_multi_raw(z, grids, syncs, uss_id, ws_scope,
                                operation_format, operation_ws,
                                earliest_operation, latest_operation)
            log.debug('Completed updating multiple grids...')
            combined_meta, new_syncs = self._get_multi_raw(z, grids)
            result = {
                'status': 'success',
                'sync_token': self._hash_sync_tokens(new_syncs),
                'data': combined_meta.to_json()
            }
        except (KeyError, RolledBackError) as e:
            result = self._format_status_code_to_jsend(
                409, self._exception_message(e))
        except ValueError as e:
            result = self._format_status_code_to_jsend(
                400, self._exception_message(e))
        except IndexError as e:
            result = self._format_status_code_to_jsend(
                404, self._exception_message(e))
        return result

    def delete_multi(self, z, grids, uss_id):
        """Sets multiple GridCells metadata by removing the entry for the USS.

        Removes the operator from multiple cells. Does not return 404 on not
        finding the USS in a cell, since this should be a remove all type
        function, as some cells might have the ussid and some might not.

        Args:
          z: zoom level in slippy tile format
          grids: list of (x,y) tiles to delete
          uss_id: is the plain text identifier for the USS
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        log.debug('Deleting multiple grid metadata for %s...', uss_id)
        try:
            if not uss_id:
                raise ValueError('Invalid uss_id for deleting multi')
            for x, y in grids:
                if not slippy_util.validate_slippy(z, x, y):
                    raise ValueError('Invalid slippy grids for lookup')
                (content, metadata) = self._get_raw(z, x, y)
                if metadata:
                    m = uss_metadata.USSMetadata(content)
                    m.remove_operator(uss_id)
                    # TODO(pelletierb): Automatically retry on delete
                    # NOTE: the write status (200/409) is not checked, so a
                    # version conflict on a cell is silently skipped.
                    self._set_raw(z, x, y, m, metadata.version)
            result = self.get_multi(z, grids)
        except ValueError as e:
            result = self._format_status_code_to_jsend(
                400, self._exception_message(e))
        return result

    ######################################################################
    ################       INTERNAL FUNCTIONS    #########################
    ######################################################################
    def _get_raw(self, z, x, y):
        """Gets the raw content and metadata for a GridCell from zookeeper.

        If the node does not exist yet it is created (empty) and read again.

        Args:
          z: zoom level in slippy tile format
          x: x tile number in slippy tile format
          y: y tile number in slippy tile format
        Returns:
          content: USS metadata
          metadata: straight from zookeeper
        """
        path = '%s/%s/%s/%s/%s' % (GRID_PATH, str(z), str(x), str(y),
                                   USS_METADATA_FILE)
        log.debug('Getting metadata from zookeeper@%s...', path)
        try:
            c, m = self.zk.get(path)
        except NoNodeError:
            self.zk.ensure_path(path)
            c, m = self.zk.get(path)
        if c:
            log.debug('Received raw content and metadata from zookeeper: %s',
                      c)
        if m:
            log.debug('Received raw metadata from zookeeper: %s', m)
        return c, m

    def _set_raw(self, z, x, y, m, version):
        """Updates the raw content for a GridCell in zookeeper.

        The write is conditional on the node version, which is what enforces
        the optimistic lock.

        Args:
          z: zoom level in slippy tile format
          x: x tile number in slippy tile format
          y: y tile number in slippy tile format
          m: metadata object to write
          version: the metadata version verified from the sync_token match
        Returns:
          200 for success, 409 for conflict (node version has changed)
        """
        path = '%s/%s/%s/%s/%s' % (GRID_PATH, str(z), str(x), str(y),
                                   USS_METADATA_FILE)
        try:
            log.debug('Setting metadata to %s...', str(m))
            self.zk.set(path, json.dumps(m.to_json()), version)
            status = 200
        except BadVersionError:
            log.error('Sync token updated before write for %s...', path)
            status = 409
        return status

    def _get_multi_raw(self, z, grids):
        """Gets the raw content and metadata for multiple GridCells.

        Args:
          z: zoom level in slippy tile format
          grids: list of (x,y) tiles to retrieve
        Returns:
          content: Combined USS metadata
          syncs: list of sync tokens in the same order as the grids
        Raises:
          IndexError: if it cannot find anything in zookeeper
          ValueError: if the grid data is not in the right format
        """
        log.debug('Getting multiple grid metadata for %s...', str(grids))
        # NOTE(review): accumulating onto None relies on USSMetadata
        # supporting reflected addition with None - confirm in uss_metadata.
        combined_meta = None
        syncs = []
        for x, y in grids:
            if not slippy_util.validate_slippy(z, x, y):
                raise ValueError('Invalid slippy grids for lookup')
            (content, metadata) = self._get_raw(z, x, y)
            if not metadata:
                raise IndexError('Unable to find metadata in platform')
            combined_meta += uss_metadata.USSMetadata(content)
            syncs.append(metadata.last_modified_transaction_id)
        if len(syncs) == 0:
            raise IndexError('Unable to find metadata in platform')
        return combined_meta, syncs

    def _set_multi_raw(self, z, grids, sync_tokens, uss_id, ws_scope,
                       operation_format, operation_ws, earliest_operation,
                       latest_operation):
        """Updates the raw content for multiple GridCells in one transaction.

        Args:
          z: zoom level in slippy tile format
          grids: list of (x,y) tiles to retrieve
          sync_tokens: list of the sync tokens received during get operation
          uss_id: plain text identifier for the USS,
          ws_scope: scope to use to obtain OAuth token,
          operation_format: output format for operation ws (i.e. NASA, GUTMA),
          operation_ws: submitting USS endpoint where all flights in
            this cell can be retrieved from,
          earliest_operation: lower bound of active or planned flight
            timestamp, used for quick filtering conflicts.
          latest_operation: upper bound of active or planned flight timestamp,
            used for quick filtering conflicts.
        Raises:
          IndexError: if there is nothing to write or a sync token is missing
          ValueError: if the grid data is not in the right format
          KeyError: if a sync token is stale or the transaction failed
        """
        log.debug('Setting multiple grid metadata for %s...', str(grids))
        try:
            contents = []
            for i, grid in enumerate(grids):
                # First, get and update them all in memory, validate the
                # sync_token
                x = grid[0]
                y = grid[1]
                sync_token = sync_tokens[i]
                path = '%s/%s/%s/%s/%s' % (GRID_PATH, str(z), str(x), str(y),
                                           USS_METADATA_FILE)
                (content, metadata) = self._get_raw(z, x, y)
                if str(metadata.last_modified_transaction_id) != str(
                        sync_token):
                    log.error(
                        'Sync token from USS (%s) does not match token from zk (%s)...',
                        str(sync_token),
                        str(metadata.last_modified_transaction_id))
                    raise KeyError('Composite sync_token has changed')
                log.debug('Sync_token matches for %d, %d...', x, y)
                m = uss_metadata.USSMetadata(content)
                if not m.upsert_operator(
                        uss_id, ws_scope, operation_format, operation_ws,
                        earliest_operation, latest_operation, z, x, y):
                    raise ValueError('Failed to set operator content')
                contents.append((path, m, metadata.version))
            if not contents:
                # Nothing to commit; keep reporting this as "not found".
                raise IndexError('Unable to find metadata in platform')
            # Now, start a transaction to update them all
            #  the version will catch any changes and roll back any attempted
            #  updates to the grids
            log.debug('Starting transaction to write all grids at once...')
            t = self.zk.transaction()
            for path, m, version in contents:
                t.set_data(path, json.dumps(m.to_json()), version)
            log.debug('Committing transaction...')
            results = t.commit()
            # A failed transaction reports exception instances in the result
            # list: the offending operation carries its own error (e.g.
            # BadVersionError) and only the others are RolledBackError, so
            # every entry has to be inspected, not just the first one.
            if any(isinstance(r, Exception) for r in results):
                raise KeyError(
                    'Rolled back multi-grid transaction due to grid change')
            log.debug('Committed transaction successfully.')
        except (KeyError, ValueError, IndexError) as e:
            log.error('Error caught in set_multi_raw %s.',
                      self._exception_message(e))
            raise

    def _format_status_code_to_jsend(self, status, message=None):
        """Formats a response based on HTTP status code.

        Args:
          status: HTTP status code
          message: optional message to override preset message for codes
        Returns:
          JSend formatted response (https://labs.omniti.com/labs/jsend)
        """
        if status == 200 or status == 204:
            # Both "OK" and "no content" are reported as an empty success.
            result = {
                'status': 'success',
                'code': 204,
                'message': 'Empty data set.'
            }
        else:
            result = {
                'status': 'fail',
                'code': status,
                'message': self._FAILURE_MESSAGES.get(
                    status, 'Unknown error code occurred.')
            }
        if message:
            result['message'] = message
        return result

    @staticmethod
    def _exception_message(error):
        """Returns the text of an exception on both Python 2 and Python 3.

        Python 2 exposes it as ``error.message``; where that attribute does
        not exist the first constructor argument is used instead.
        """
        message = getattr(error, 'message', None)
        if message is None:
            message = str(error.args[0]) if error.args else ''
        return message

    @staticmethod
    def _hash_sync_tokens(syncs):
        """Hashes a list of sync tokens into a single, positive int.

        The tokens are sorted first so the result does not depend on grid
        order, and the hash is reduced modulo MAX_SAFE_INTEGER so it stays
        within the integer range other languages can represent.
        """
        return abs(hash(tuple(sorted(syncs)))) % MAX_SAFE_INTEGER
class ClusterZookeeper(object):
    """Mirrors a Kafka cluster's ZooKeeper state and tracks topic throughput.

    Keeps three views up to date through ZooKeeper watches: consumer groups
    (``groups_dict``), topics (``topics_dict``) and broker ids
    (``brokers_list``). A daemon thread (``latest``) keeps polling the log-end
    offsets of every topic to estimate its message rate.
    """

    def __init__(self, zookeeper_hosts, kafka_hosts):
        """Connects to ZooKeeper and Kafka and installs the watches.

        Args:
            zookeeper_hosts: ZooKeeper connection string.
            kafka_hosts: comma separated Kafka bootstrap servers.

        Raises:
            ValueError: if the ensemble has no ``/consumers`` or ``/brokers``
                node, i.e. it does not look like a Kafka ZooKeeper.
        """
        self.groups_dict = {}
        self.topics_dict = {}
        self.brokers_list = []
        self.consumer = KafkaConsumer(bootstrap_servers=kafka_hosts.split(','))
        self.zk = KazooClient(hosts=zookeeper_hosts)
        self.zk.add_listener(self.keep_start)
        self.zk.start()
        if (self.zk.exists('/consumers') is None
                or self.zk.exists('/brokers') is None):
            # Do not leave a started client (and its threads) behind when the
            # constructor fails.
            self.close_zk()
            raise ValueError(zookeeper_hosts + ' is not zookeeper of kafka')
        ChildrenWatch(self.zk, '/consumers', self.groups_watch)
        ChildrenWatch(self.zk, '/brokers/topics', self.topics_watch)
        ChildrenWatch(self.zk, '/brokers/ids/', self.brokers_watch)
        t = threading.Thread(target=self.latest, name=kafka_hosts)
        t.daemon = True
        t.start()

    # Make sure the connection stays usable
    def keep_start(self, client_status):
        """Connection listener: tries to restart the client when not connected.

        Failures are logged and swallowed so a failed restart attempt never
        propagates out of the listener callback.
        """
        if client_status == 'CONNECTED':
            return
        try:
            self.zk.start()
        except Exception:
            import logging
            logging.getLogger(__name__).warning(
                'Unable to restart zookeeper client (state %s)',
                client_status, exc_info=True)

    # Watch the consumers node
    def groups_watch(self, children):
        """Synchronises ``groups_dict`` with the children of ``/consumers``."""
        for group in [g for g in self.groups_dict.keys() if g not in children]:
            self.groups_dict.pop(group)
        for group in [g for g in children
                      if g not in self.groups_dict.keys()]:
            owners_p = '/consumers/' + group + '/owners'
            if self.zk.exists(owners_p) is None:
                continue
            g_o_t = GroupOwnersTopic()
            self.groups_dict[group] = g_o_t
            ChildrenWatch(self.zk, owners_p, g_o_t.g_topic_watch)

    # Watch the topic node
    def topics_watch(self, children):
        """Synchronises ``topics_dict`` with the children of the topics node."""
        for topic in [t for t in self.topics_dict.keys() if t not in children]:
            self.topics_dict.pop(topic)
        for topic in [t for t in children
                      if t not in self.topics_dict.keys()]:
            t_v = TopicValue()
            self.topics_dict[topic] = t_v
            DataWatch(self.zk, '/brokers/topics/' + topic, t_v.topic_watch)
            # partitions_for_topic() may return None for a topic the consumer
            # does not know yet; treat that as "no partitions".
            partition_ids = self.consumer.partitions_for_topic(topic) or ()
            t_v.topic_partition = [TopicPartition(topic, p)
                                   for p in partition_ids]

    # Watch the broker node
    def brokers_watch(self, children):
        """Stores the current list of broker ids."""
        self.brokers_list = children

    def close_zk(self):
        """Detaches the listener and shuts the ZooKeeper client down.

        Best effort: every step is attempted even if an earlier one fails, and
        failures are logged instead of raised.
        """
        steps = (
            lambda: self.zk.remove_listener(self.keep_start),
            lambda: self.zk.stop(),
            lambda: self.zk.close(),
        )
        for step in steps:
            try:
                step()
            except Exception:
                import logging
                logging.getLogger(__name__).warning(
                    'Error while closing zookeeper client', exc_info=True)

    def latest(self):
        """Endlessly samples log-end offsets and updates each topic's speed.

        For every topic the sum of the end offsets of its partitions is
        compared with the previous sample; ``speed`` is the offset delta
        divided by the elapsed whole seconds. Runs forever, so it is meant to
        be used as a daemon thread target.
        """
        import logging
        logger = logging.getLogger(__name__)
        while True:
            # time.sleep(0.1)
            time.sleep(0.001)
            # Snapshot the values: the watch callbacks mutate topics_dict from
            # another thread while this loop is running.
            for v in list(self.topics_dict.values()):
                try:
                    partitions = v.topic_partition
                    if not partitions:
                        continue
                    self.consumer.assign(partitions)
                    self.consumer.seek_to_end(*partitions)
                    log_offset = sum(self.consumer.position(p)
                                     for p in partitions)
                    now_timestamp = int(time.mktime(time.localtime()))
                    previous = getattr(v, 'timestamp', None)
                    if previous is not None:
                        elapsed = now_timestamp - previous
                        if elapsed == 0:
                            # Still inside the same second: keep the previous
                            # sample so the next delta spans a full interval.
                            continue
                        v.speed = (log_offset - v.off_set) / elapsed
                    v.timestamp = now_timestamp
                    v.off_set = log_offset
                except Exception:
                    # Best effort sampling; one failing topic must not stop
                    # the loop.
                    logger.debug('Unable to sample topic offsets',
                                 exc_info=True)