class ZKClient(object):
    """Publish this process under a ZooKeeper path and refresh a heartbeat znode.

    The znode path is ``<server_info_path>/<ip>:<port>`` and its value is the
    current Unix timestamp (whole seconds) at the time of the last heartbeat.
    """

    def __init__(self, zk_hosts, local_port, local_host=None, server_info_path='/crawlers'):
        """Store the connection settings and connect immediately.

        :param zk_hosts: host string handed to ``KazooClient(hosts=...)``.
        :param local_port: port used to build this server's znode name.
        :param local_host: address used in the znode name; when falsy the
            value returned by ``get_local_host()`` is used instead.
        :param server_info_path: parent path of the per-server znode.
        """
        self.zk_hosts = zk_hosts
        self.local_host = local_host
        if not self.local_host:
            self.local_host = get_local_host()
        self.server_info_path = server_info_path
        self.server_info_znode = '{server_info_path}/{ip}:{port}'.format(
            server_info_path=server_info_path, ip=self.local_host, port=local_port)
        self.connect_zk()

    def connect_zk(self):
        """Create the kazoo client, register the state listener and start it."""
        self.zk = KazooClient(hosts=self.zk_hosts)
        self.zk.add_listener(self.state_listener)
        self.zk.start()

    def update_heartbeat(self):
        """Asynchronously write the current timestamp into this server's znode.

        The znode is updated when it already exists and otherwise created as
        an ephemeral node (parents are created as needed).
        """
        # kazoo only accepts byte strings as znode data; encoding is a no-op
        # for the ASCII digits on Python 2 and required on Python 3.
        ts = str(int(time.time())).encode('utf-8')

        def callback(async_stat):
            # ``get()`` re-raises the error of a failed ``exists`` call.
            stat = async_stat.get()
            if stat:
                self.zk.set_async(self.server_info_znode, ts)
            else:
                self.zk.create_async(self.server_info_znode, ts, ephemeral=True, makepath=True)

        async_stat = self.zk.exists_async(self.server_info_znode, watch=None)
        async_stat.rawlink(callback)

    def state_listener(self, state):
        """Connection-state callback: refresh the heartbeat on (re)connect."""
        if state == KazooState.LOST:
            # Register somewhere that the session was lost
            pass
        elif state == KazooState.SUSPENDED:
            # Handle being disconnected from Zookeeper
            pass
        else:
            self.update_heartbeat()

    def add_watcher(self, request_handler):
        """Watch ``ZOO_CONFIG_PROXY_PATH`` and forward its data to the handler.

        Each change invokes ``request_handler.config_proxy_via_zookeeper(data)``
        through ``run_in_thread``.
        """
        @self.zk.DataWatch(ZOO_CONFIG_PROXY_PATH)
        def proxy_change(data, stat):
            run_in_thread(request_handler.config_proxy_via_zookeeper, data)

    def close(self):
        """Stop the underlying kazoo client."""
        self.zk.stop()
class NodeMonitor:
    """Periodically push machine information into ``/monitorData/<node id>``.

    Node ids are handed out from the class-level ``STATIC_NODE_ID`` counter,
    one per constructed instance.
    """

    STATIC_NODE_ID = 0

    def __init__(self):
        self.zk = None
        self.SERVER_IP_AND_PORT = "localhost:2181"
        self.NODE_ID = str(NodeMonitor.STATIC_NODE_ID)
        NodeMonitor.STATIC_NODE_ID += 1

    def start_zk(self):
        """Create and start a kazoo client and make sure this node's path exists."""
        self.zk = KazooClient(hosts=self.SERVER_IP_AND_PORT)
        self.zk.add_listener(self._connection_listener)
        self.zk.start()
        self.zk.ensure_path("/monitorData/" + self.NODE_ID)

    def start_update_info(self):
        """Kick off the update loop on a timer thread without delay."""
        t = threading.Timer(0.0, self._update_info)
        t.start()

    def _update_info_once(self):
        """Collect the machine info and write it asynchronously to this node."""
        cmi = CollectMachineInfo()
        async_obj = self.zk.set_async("/monitorData/" + self.NODE_ID,
                                      (cmi.collectInfo()).encode(encoding="utf-8"))
        async_obj.rawlink(self._update_info_callback)

    def _connection_listener(self, state):
        """Report connection state changes; build a new client when the session is lost."""
        if state == KazooState.LOST:
            print("connection lost, going to connect again")
            # NOTE(review): this runs start_zk() (a blocking connect) from
            # inside the listener and replaces self.zk without stopping the
            # previous client -- confirm this is intended.
            self.start_zk()
        elif state == KazooState.SUSPENDED:
            print("suspended")
        else:
            print("connected ok")

    def _update_info_callback(self, async_obj):
        """Report the outcome of an asynchronous ``set``.

        ``async_obj.get()`` re-raises the error of a failed operation; without
        that call the ``except`` clause could never trigger and success was
        reported unconditionally.
        """
        try:
            async_obj.get()
        except (ConnectionLossException, NoAuthException):
            print("exception!")
        else:
            print("update success")

    def _update_info(self):
        """Run one update and re-arm the timer to run again in 5 seconds."""
        print("begin to update")
        self._update_info_once()
        t = threading.Timer(5.0, self._update_info)
        t.start()
class ZooKeeper(AbstractDCS):
    """DCS implementation backed by ZooKeeper through a kazoo client.

    Cluster state is cached; ZooKeeper watches (``cluster_watcher``) flag the
    cache as stale by setting ``_fetch_cluster`` and waking up ``self.event``.
    """

    def __init__(self, config):
        """Build the kazoo client from ``config`` and start it.

        Reads ``hosts`` (list or comma separated string), ``retry_timeout``
        and ``ttl`` from ``config``.
        """
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry={'max_delay': 1, 'max_tries': -1},
                                   command_retry={'deadline': config['retry_timeout'],
                                                  'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        # Wrap kazoo's private connect routine so the read timeout can be overridden.
        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        """Invalidate the cached cluster when the session is suspended or lost."""
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        """Watch callback: mark the cluster cache stale and wake up waiters."""
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        """Apply new ``retry_timeout``, ``loop_wait`` and ``ttl`` values."""
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    def set_retry_timeout(self, retry_timeout):
        """Set the deadline of the client's command retry object."""
        self._client._retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        """Return ``(decoded value, znode stat)`` for ``key`` or None if it does not exist."""
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        """Build a ``Member`` from a member znode's name, value and stat."""
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        """Return the children of ``key`` or an empty list if it does not exist."""
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        """Read every child of the members path that still exists into ``Member`` objects."""
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        """Read the cluster state from ZooKeeper and store it in ``self._cluster``."""
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                # Placeholder (index -1) used when the leader is not in the members list.
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                # Keep refetching while the leader's member entry is missing.
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, config, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        """Refresh the cached cluster when it is stale or missing.

        :raises ZooKeeperError: when reading the cluster fails; the cache is
            flagged stale again before raising.
        """
        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')

    def _create(self, path, value, **kwargs):
        """Create ``path`` with the utf-8 encoded ``value``; return True on success, False on any error."""
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except Exception:
            return False

    def attempt_to_acquire_leader(self, permanent=False):
        """Try to create the leader key (ephemeral unless ``permanent``); return success."""
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        """Write the failover key, optionally guarded by version ``index``.

        When the key does not exist, an empty ``value`` counts as success and
        a non-empty one is created only if no ``index`` was requested.
        """
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (index is None and self._create(self.failover_path, value))
        except Exception:
            logger.exception('set_failover_value')
            return False

    def set_config_value(self, value, index=None):
        """Write the config key, optionally guarded by version ``index``.

        A missing key is created only if no ``index`` was requested.
        """
        try:
            self._client.retry(self._client.set, self.config_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return index is None and self._create(self.config_path, value)
        except Exception:
            logger.exception('set_config_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        """Create the initialize key with ``sysid`` or, if not ``create_new``, overwrite it."""
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None, permanent=False):
        """Create or update this node's member key with ``data``; return success.

        A member key owned by a different session is deleted first so it can
        be recreated by the current session. Nothing is written when ``data``
        equals what was written last time.
        """
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if data == self._my_member_data:
                return True
        else:
            try:
                self._client.create_async(self.member_path, data, makepath=True,
                                          ephemeral=not permanent).get(timeout=1)
                self._my_member_data = data
                return True
            except Exception as e:
                # An already existing node is updated by the set below.
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, data).get(timeout=1)
            self._my_member_data = data
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        """Store ``last_operation`` in the leader optime key, creating the key if needed.

        Failures are logged and not raised.
        """
        last_operation = last_operation.encode('utf-8')
        # NOTE(review): _inner_load_cluster stores an int in
        # _last_leader_operation while this method stores the encoded value, so
        # after a cluster reload this comparison is always unequal -- confirm.
        if last_operation != self._last_leader_operation:
            try:
                self._client.set_async(self.leader_optime_path, last_operation).get(timeout=1)
                self._last_leader_operation = last_operation
            except NoNodeError:
                try:
                    self._client.create_async(self.leader_optime_path, last_operation, makepath=True).get(timeout=1)
                    self._last_leader_operation = last_operation
                except Exception:
                    logger.exception('Failed to create %s', self.leader_optime_path)
            except Exception:
                logger.exception('Failed to update %s', self.leader_optime_path)

    def update_leader(self):
        return True

    def delete_leader(self):
        """Restart the client session and forget the last written member data."""
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        """Delete the initialize key at the version that was just read, if it exists."""
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        """Delete the initialize key with retries; failures are logged, not raised."""
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        """Recursively delete the cluster's root path; a missing path counts as success."""
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        """Wait via the base class and return whether the cluster must be refetched."""
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
class ZooKeeper(AbstractDCS):
    """DCS implementation backed by ZooKeeper through a kazoo client.

    Two flags track cache staleness: ``_fetch_cluster`` for the whole cluster
    view and ``_fetch_optime`` for the leader optime key only. Watches set the
    flags and wake up ``self.event``.
    """

    def __init__(self, config):
        """Build the kazoo client from ``config`` and start it.

        Reads ``hosts`` (list or comma separated string), ``retry_timeout``,
        ``ttl`` and the optional SSL settings listed in ``mapping``.
        """
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        # config key -> KazooClient keyword argument
        mapping = {'use_ssl': 'use_ssl', 'verify': 'verify_certs', 'cacert': 'ca',
                   'cert': 'certfile', 'key': 'keyfile', 'key_password': 'keyfile_password'}
        kwargs = {v: config[k] for k, v in mapping.items() if k in config}

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'],
                                   connection_retry=KazooRetry(max_delay=1, max_tries=-1, sleep_func=time.sleep),
                                   command_retry=KazooRetry(deadline=config['retry_timeout'], max_delay=1,
                                                            max_tries=-1, sleep_func=time.sleep),
                                   **kwargs)
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True
        self._fetch_optime = True

        # Wrap kazoo's private connect routine so the read timeout can be overridden.
        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, *args):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(*args)
        return max(self.loop_wait - 2, 2) * 1000, ret[1]

    def session_listener(self, state):
        """Invalidate the cached cluster when the session is suspended or lost."""
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def optime_watcher(self, event):
        """Watch callback: mark the leader optime stale and wake up waiters."""
        self._fetch_optime = True
        self.event.set()

    def cluster_watcher(self, event):
        """Watch callback: mark the whole cluster (and optime) stale and wake up waiters."""
        self._fetch_cluster = True
        self.optime_watcher(event)

    def reload_config(self, config):
        """Apply new ``retry_timeout``, ``loop_wait`` and ``ttl`` values."""
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        """Set the deadline of the client's command retry object.

        Uses ``client.retry`` when it is a ``KazooRetry`` instance and the
        private ``client._retry`` otherwise.
        """
        retry = self._client.retry if isinstance(self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        """Return ``(decoded value, znode stat)`` for ``key`` or None if it does not exist."""
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    def get_leader_optime(self, leader):
        """Read the leader optime key as an int (0 when missing or empty).

        A watch is installed unless this node itself is the ``leader``.
        """
        watch = self.optime_watcher if not leader or leader.name != self._name else None
        optime = self.get_node(self.leader_optime_path, watch)
        self._fetch_optime = False
        return optime and int(optime[0]) or 0

    @staticmethod
    def member(name, value, znode):
        """Build a ``Member`` from a member znode's name, value and stat."""
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        """Return the children of ``key`` or an empty list if it does not exist."""
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        """Read every existing member key into ``Member`` objects.

        Members named in ``sync_standby`` additionally get a data watch.
        """
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            watch = member in sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        """Read the cluster state from ZooKeeper and return it as a ``Cluster``."""
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(history[1].mzxid, history[0])

        # get synchronization state
        sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.members or []
        members = self.load_members(sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                # Placeholder (index -1) used when the leader is not in the members list.
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                # Keep refetching while the leader's member entry is missing.
                self._fetch_cluster = member.index == -1

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(leader)

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)

    def _load_cluster(self):
        """Return the cluster view, refreshing all of it or only the optime as needed.

        :raises ZooKeeperError: when a full reload fails; the cache is flagged
            stale again before raising.
        """
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper in not responding properly')
        # Optime ZNode was updated or doesn't exist and we are not leader
        elif (self._fetch_optime and not self._fetch_cluster or not cluster.last_leader_operation) and\
                not (cluster.leader and cluster.leader.name == self._name):
            try:
                optime = self.get_leader_optime(cluster.leader)
                cluster = Cluster(cluster.initialize, cluster.config, cluster.leader, optime,
                                  cluster.members, cluster.failover, cluster.sync, cluster.history)
            except Exception:
                # Failure to refresh the optime is ignored; the previous view is returned.
                pass
        return cluster

    def _bypass_caches(self):
        self._fetch_cluster = True

    def _create(self, path, value, retry=False, ephemeral=False):
        """Create ``path`` (with parents) holding ``value``; return success.

        With ``retry`` the client's retry wrapper is used, otherwise a single
        asynchronous attempt with a 1 second timeout. Errors are logged.
        """
        try:
            if retry:
                self._client.retry(self._client.create, path, value, makepath=True, ephemeral=ephemeral)
            else:
                self._client.create_async(path, value, makepath=True, ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        """Try to create the leader key (ephemeral unless ``permanent``); return success."""
        ret = self._create(self.leader_path, self._name.encode('utf-8'), retry=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self, key, value, index=None, retry=False, do_not_create_empty=False):
        """Set ``key`` to the utf-8 encoded ``value``, creating it when missing.

        :param index: expected znode version; a missing key is only created
            when no ``index`` was requested.
        :param retry: use the client's retry wrapper instead of one attempt
            with a 1 second timeout.
        :param do_not_create_empty: treat a missing key as success when
            ``value`` is empty.
        :returns: True on success, False otherwise.
        """
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set, key, value, version=index or -1)
            else:
                self._client.set_async(key, value, version=index or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        """Create the initialize key with ``sysid`` or, if not ``create_new``, overwrite it."""
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        """Create or update this node's member key with ``data``; return success.

        The existing key is deleted (to be recreated) when it belongs to a
        different session or when its ``tags``, ``version`` or
        ``checkpoint_after_promote`` differ from ``data``. Nothing is written
        when the stored data already equals ``data``.
        """
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        if member and (self._client.client_id is not None and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}), data.get('tags', {})) and
                            member.data.get('version') == data.get('version') and
                            member.data.get('checkpoint_after_promote') == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(self.member_path, encoded_data, makepath=True,
                                          ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                # An already existing node is updated by the set below.
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        return True

    def _delete_leader(self):
        """Restart the client session."""
        self._client.restart()
        return True

    def _cancel_initialization(self):
        """Delete the initialize key at the version that was just read, if it exists."""
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        """Delete the initialize key with retries; failures are logged, not raised."""
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        """Recursively delete the cluster's root path; a missing path counts as success."""
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        return self._set_or_create(self.sync_path, value, index, retry=True, do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        """Wait via the base class and return whether the cluster must be refetched.

        A wake-up that was caused only by an optime change does not force a
        full cluster reload.
        """
        if super(ZooKeeper, self).watch(leader_index, timeout) and not self._fetch_optime:
            self._fetch_cluster = True
        return self._fetch_cluster
if MACHINE_STRING not in children: create_service_async_job = zk.create_async( path=ROOT_PATH + SERVICE_PATH + '/' + MACHINE_STRING, value=MACHINE_AVAIL_SERVICE, ephemeral=True) create_service_async_job.rawlink(create_path_async_callback) else: print "PATH existed." except (ConnectionLossException, NoAuthException): logger.error('Can not connect.') sys.exit(1) if __name__ == '__main__': zk.start() zk.add_listener(my_listener) get_children_async_obj = zk.get_children_async(ROOT_PATH + SERVICE_PATH) get_children_async_obj.rawlink(get_service_path_children_callback) while True: input_data = raw_input() if input_data != foredata: set_value_async = zk.set_async(path=ROOT_PATH + SERVICE_PATH + '/' + MACHINE_STRING, value=input_data) set_value_async.rawlink( partial(set_service_child_value_callback, input_data)) if input_data == 'exit': print 'Exit' sys.exit(1) time.sleep(5)
class Zookeeper:
    """Coroutine-style wrapper around a gevent-backed kazoo client.

    Every operation is exposed as a coroutine that delivers its result through
    ``Return``; errors from kazoo propagate to the caller unchanged.
    """

    def __init__(self, hosts):
        """Start connecting to ``hosts`` and wait up to 30 seconds.

        If the client is still not connected after the wait it is stopped; no
        exception is raised for that case.
        """
        self.zk = KazooClient(hosts=hosts, handler=SequentialGeventHandler(), logger=logger)
        # returns immediately
        event = self.zk.start_async()
        # Wait for 30 seconds and see if we're connected
        event.wait(timeout=30)
        if not self.zk.connected:
            # Not connected, stop trying to connect
            self.zk.stop()

    @coroutine
    def get_children(self, node):
        """Return the result of ``get_children_async(node)``."""
        children = self.zk.get_children_async(node)
        raise Return(children.get())

    @coroutine
    def get_node(self, node):
        """Return the result of ``get_async(node)``."""
        data = self.zk.get_async(node)
        raise Return(data.get())

    @coroutine
    def check_path_exist(self, path):
        """Return True if ``exists(path)`` yields a truthy result, else False."""
        raise Return(bool(self.zk.exists(path)))

    @coroutine
    def create_path(self, path):
        """Ensure ``path`` exists and return the result of ``ensure_path_async``."""
        result = self.zk.ensure_path_async(path)
        raise Return(result.get())

    @coroutine
    def create_node(self, path, value):
        """Create an ephemeral node at ``path`` holding ``value``."""
        result = self.zk.create_async(path=path, value=value, acl=None, ephemeral=True)
        raise Return(result.get())

    @coroutine
    def update_node(self, path, value, version=-1):
        """Set the data of ``path`` to ``value``, passing ``version`` through to ``set_async``."""
        result = self.zk.set_async(path, value, version)
        raise Return(result.get())

    @coroutine
    def del_node(self, node):
        """Delete ``node`` and return the result of ``delete_async``."""
        node_info = self.zk.delete_async(node)
        raise Return(node_info.get())

    def close(self):
        """Stop the underlying kazoo client."""
        self.zk.stop()
class NodeMonitor:
    """Periodically publish machine and process information to ZooKeeper.

    Data is written below three base paths, each suffixed with ``/<node ip>``:
    machine info (re-armed every second), process info (every 10 seconds) and
    one-shot machine info.
    """

    def __init__(self, argv1):
        """Set up paths and node identity.

        :param argv1: pass ``'-server'`` to run in server mode, which selects
            the ``eth1`` network card instead of ``eth0``.
        """
        # the client net card eth0
        self.static_net_card = 'eth0'
        self.static_path_for_data_3_second_once_time = "/monitorData"
        self.static_path_for_data_10_second_once_time = "/monitorDataProcessInfo"
        self.static_path_for_data_just_one_time = "/monitorDataJustOneTime"
        self.is_server = False
        if argv1 == '-server':
            self.is_server = True
            self._set_up_server_path()
        self.STATIC_NODE_IP_ADDRESS = get_ip_address(self.static_net_card)
        self.STATIC_NODE_MAC_ADDRESS = get_mac_address()
        self.zk = None
        self.SERVER_IP_AND_PORT = "172.18.229.251:2181"
        # The node is identified by its IP address.
        self.NODE_ID = str(self.STATIC_NODE_IP_ADDRESS)
        self.NODE_ID_PATH = '/' + self.NODE_ID

    def _set_up_server_path(self):
        """Switch to the server's network card."""
        # the server net card eth1
        self.static_net_card = 'eth1'

    def start_zk(self):
        """Connect to ``SERVER_IP_AND_PORT``, retrying every 5 seconds on timeout.

        Once connected, make sure this node's machine-info path exists.
        """
        while True:
            self.zk = KazooClient(self.SERVER_IP_AND_PORT)
            self.zk.add_listener(self._connection_listener)
            try:
                self.zk.start()
                break
            except KazooTimeoutError:
                print("connect fail, going to reconnect")
                time.sleep(5.0)
        self.zk.ensure_path(self.static_path_for_data_3_second_once_time + self.NODE_ID_PATH)

    def start_update_info(self):
        """Start both periodic update loops on timer threads without delay."""
        t = threading.Timer(0.0, self._update_info)
        t.start()
        t2 = threading.Timer(0.0, self._update_info_10_second_once_time)
        t2.start()

    def _update_info_once(self):
        """Collect machine info and write it asynchronously to this node's path."""
        cmi = CollectMachineInfo(self.is_server)
        async_obj = self.zk.set_async(self.static_path_for_data_3_second_once_time + self.NODE_ID_PATH,
                                      (cmi.collectInfo()).encode(encoding="utf-8"))
        async_obj.rawlink(self._update_info_callback)

    def _connection_listener(self, state):
        """Report connection state changes; reconnect when the session is lost."""
        if state == KazooState.LOST:
            print("connection lost, going to connect again")
            # NOTE(review): this runs start_zk() (a blocking connect) from
            # inside the listener and replaces self.zk without stopping the
            # previous client -- confirm this is intended.
            self.start_zk()
        elif state == KazooState.SUSPENDED:
            print("suspended")
        else:
            return

    def _update_info_callback(self, async_obj):
        """Check the outcome of an asynchronous ``set``.

        ``async_obj.get()`` re-raises the error of a failed operation; without
        that call the ``except`` clause could never trigger.
        """
        try:
            async_obj.get()
        except (ConnectionLossException, NoAuthException):
            print("exception!")

    def _update_info(self):
        """Publish machine info once and re-arm the timer for 1 second later."""
        self.zk.ensure_path(self.static_path_for_data_3_second_once_time + self.NODE_ID_PATH)
        self._update_info_once()
        t = threading.Timer(1.0, self._update_info)
        t.start()

    def _update_info_10_second_once_time(self):
        """Publish process info once and re-arm the timer for 10 seconds later."""
        self.zk.ensure_path(self.static_path_for_data_10_second_once_time + self.NODE_ID_PATH)
        self._update_info_once_10_second()
        t = threading.Timer(10.0, self._update_info_10_second_once_time)
        t.start()

    def _update_info_once_10_second(self):
        """Collect process info and write it asynchronously to this node's path."""
        cpi = CollectProcessInfo()
        async_obj = self.zk.set_async(self.static_path_for_data_10_second_once_time + self.NODE_ID_PATH,
                                      (cpi._get_process_info()))
        async_obj.rawlink(self._update_info_callback)

    def _update_info_just_one_time(self):
        """Publish the one-shot machine info to this node's path."""
        self.zk.ensure_path(self.static_path_for_data_just_one_time + self.NODE_ID_PATH)
        cmi_just_one_time = CollectMachineInfo(self.is_server)
        async_obj_just_one_time = self.zk.set_async(
            self.static_path_for_data_just_one_time + self.NODE_ID_PATH,
            (cmi_just_one_time.collectInfoJustOneTime()).encode(encoding="utf-8"))
        async_obj_just_one_time.rawlink(self._update_info_callback)
class Coordinator(object):
    """Publish and query HQ server / job membership in ZooKeeper.

    Node layout, relative to ``root``::

        <root>/servers/<nodename>            this server
        <root>/servers/<nodename>/alive      ephemeral liveness marker
        <root>/servers/<nodename>/jobs/<job> jobs handled by this server
        <root>/jobs/<job>                    global job registry

    Listeners registered with ``add_listener`` are notified with the events
    ``"serverschanged"`` and ``"jobschanged"``.
    """

    def __init__(self, zkhosts, root=NODE_HQ_ROOT, alivenode="alive",
                 readonly=False, role=None):
        """zkhosts: a string or a list. list will be ','.join-ed into a string.
        root: root node path (any parents must exist, if any)
        alivenode: name of the ephemeral child marking this server as alive.
        readonly: when true, nothing is published; watches are still set.
        role: accepted but not used by this class.
        """
        self.LOGGER = logging.getLogger("hq.zkcoord")
        if not isinstance(zkhosts, basestring):
            zkhosts = ",".join(zkhosts)
        self.zkhosts = zkhosts
        self.ROOT = root
        self.alivenode = alivenode
        self.readonly = readonly
        self.nodename = os.uname()[1]
        self.NODE_SERVERS = self.ROOT + "/servers"
        self.NODE_ME = self.NODE_SERVERS + "/" + self.nodename
        self.NODE_MYJOBS = self.NODE_ME + "/jobs"
        self.NODE_GJOBS = self.ROOT + "/jobs"
        self.__listeners = {}
        self.jobs = {}
        self.zh = None
        self.zstate = None
        # Must exist even when connecting succeeds, otherwise
        # get_status_text() raises AttributeError.
        self.zkerror = None
        self._connect()

    def _connect(self):
        """(Re)create the Kazoo client and block until it is connected.

        A ``ZookeeperError`` is recorded in ``self.zkerror`` and leaves
        ``self.zh`` set to None instead of propagating.
        """
        try:
            if self.zh:
                self.zh.stop()
            self.LOGGER.debug("connecting to %s", self.zkhosts)
            self.zh = KazooClient(hosts=self.zkhosts)
            self.zh.add_listener(self.__watcher)
            # this will wait until connection is established.
            self.zh.start()
        except ZookeeperError as ex:
            self.zh = None
            self.zkerror = ex

    def _initialize(self):
        """Publish this server's nodes and (re)install the child watches.

        Runs on a handler-spawned thread after every (re)connection.  Every
        step is idempotent, so repeating it after a reconnect is harmless and
        restores the ephemeral alive node after a session loss.
        """
        if not self.readonly:
            # No dependency on self.zstate here: __watcher updates it
            # concurrently with this thread, which made a "first connection
            # only" check racy.
            self.zh.ensure_path(self.NODE_SERVERS)
            self.zh.ensure_path(self.NODE_GJOBS)
            # The server node has to exist before its ephemeral child can be
            # created, so create it before publishing the alive marker.
            self.zh.ensure_path(self.NODE_ME)
            self.zh.ensure_path(self.NODE_MYJOBS)
            self.publish_alive()
        # setup notifications
        self.zh.get_children(self.NODE_SERVERS, self.__servers_watcher)
        self.zh.get_children(self.NODE_GJOBS, self.__jobs_watcher)

    def __watcher(self, state):
        """Connection-state listener; records the state and re-initializes."""
        # client level callback method. this method should not block.
        self.zstate = state
        if state not in (KazooState.LOST, KazooState.SUSPENDED):
            # (re)connected
            self.LOGGER.debug("connected")
            self.zh.handler.spawn(self._initialize)

    def get_status_text(self):
        """Return the last connection error, or None if there was none."""
        return self.zkerror

    def create(self, path, data=""):
        """Synchronously create ``path`` with ``data``."""
        return self.zh.create(path, data)

    def acreate(self, path, data=""):
        """Asynchronously create ``path`` with ``data``; returns the async result."""
        # KazooClient's asynchronous create is create_async; it has no acreate.
        return self.zh.create_async(path, data)

    def exists(self, path):
        """Return the result of the client's ``exists`` call for ``path``."""
        return self.zh.exists(path)

    def delete(self, path):
        """Delete ``path``; a missing node is ignored (returns None)."""
        try:
            return self.zh.delete(path)
        except NoNodeError:
            return None

    def get_children(self, path, watch=None):
        """Return the children of ``path``, optionally installing ``watch``."""
        return self.zh.get_children(path, watch=watch)

    def __servers_watcher(self, event=None):
        """called when HQ servers are added / dropped."""
        # Kazoo invokes watch callbacks with a single event argument.
        try:
            ch = self.get_children(self.NODE_SERVERS,
                                   watch=self.__servers_watcher)
            self.LOGGER.info("servers added/removed:%s", str(ch))
            self.fire_event("serverschanged")
        except ZookeeperError:
            self.LOGGER.warning("zk.get_children(%r) failed",
                                self.NODE_SERVERS, exc_info=1)

    def __jobs_watcher(self, event=None):
        """called when jobs are added / dropped."""
        try:
            self.LOGGER.info("%s children changed", self.NODE_GJOBS)
            # Called for its side effect: re-arming the one-shot watch.
            self.get_children(self.NODE_GJOBS, watch=self.__jobs_watcher)
            self.fire_event("jobschanged")
        except ZookeeperError:
            self.LOGGER.warning("get_children(%r) failed",
                                self.NODE_GJOBS, exc_info=1)

    def publish_alive(self):
        """Asynchronously create this server's ephemeral alive node."""
        node_alive = self.NODE_ME + "/" + self.alivenode
        self.zh.create_async(node_alive, ephemeral=True)

    def publish_job(self, job):
        """job: hq.CrawlJob

        Touches this server's job node and registers the job globally, at
        most once every 10 minutes per job.
        """
        ju = self.jobs.get(job)
        # update 10 minutes interval
        if ju is None or ju < time.time() - 10 * 60:
            NODE_MYJOB = self.NODE_MYJOBS + "/" + job.jobname

            def set_complete(a):
                # a.exception is an exception *instance*; comparing it to the
                # class with == never matched.
                if isinstance(a.exception, NoNodeError):
                    # node does not exist yet - create anew
                    self.zh.create_async(NODE_MYJOB, "")

            try:
                a = self.zh.set_async(NODE_MYJOB, "")
                a.rawlink(set_complete)
            except Exception:
                self.LOGGER.warning("aset failed", exc_info=1)

            node2 = self.NODE_GJOBS + "/" + job.jobname
            self.zh.create_async(node2)
            self.zh.create_async(
                "{}/{}/{}".format(self.NODE_GJOBS, job.jobname, self.nodename),
                ephemeral=True)
            self.jobs[job] = time.time()

    def publish_client(self, job, client):
        """Not implemented."""
        pass

    def get_servers(self):
        """Return the names of all registered servers."""
        return self.zh.get_children(self.NODE_SERVERS)

    def get_server_job(self, server, job):
        """Return ``{"ts": <mtime in seconds>}`` for a server's job node.

        ``ts`` is 0 when the node does not exist.
        """
        p = self.NODE_SERVERS + "/" + server + "/jobs/" + job
        j = dict()
        try:
            nodeval = self.zh.get(p)
            attr = nodeval[1]
            j["ts"] = attr.mtime / 1000.0
        except NoNodeError:
            j["ts"] = 0
        return j

    def get_status_of(self, server=None, jobs=None):
        """Return a status dict (``name``, ``alive``, ``jobs``) for ``server``.

        server: server name; defaults to this node.
        jobs: a job name, a list of job names, or None for all of the
            server's jobs.
        Returns None when there is no ZooKeeper client.
        """
        if self.zh is None:
            return None
        server = server or self.nodename
        status = dict(name=server)
        try:
            node = self.zh.get(self.NODE_SERVERS + "/" + server + "/alive")
            status["alive"] = node[1]
        except NoNodeError:
            status["alive"] = False

        jobspath = self.NODE_SERVERS + "/" + server + "/jobs"
        if jobs is None:
            try:
                jobs = self.get_children(jobspath)
            except NoNodeError:
                jobs = []
        elif isinstance(jobs, basestring):
            jobs = [jobs]
        status["jobs"] = []
        for j in jobs:
            jobj = self.get_server_job(server, j)
            jobj["name"] = j
            status["jobs"].append(jobj)
        return status

    def get_servers_status(self):
        """Return ``get_status_of`` for every registered server."""
        return [self.get_status_of(server) for server in self.get_servers()]

    def get_job_servers(self, jobname):
        """return a map of integer identifier to server name, which is
        configured for the job jobname.
        """
        p = self.NODE_GJOBS + "/" + jobname + "/servers"
        try:
            svids = self.get_children(p)
        except NoNodeError:
            return {}
        servers = {}
        for name in svids:
            try:
                svid = int(name)
            except ValueError:
                # not a numeric server id - skip
                continue
            # Read the child under its actual name; str(svid) differs from
            # the name for children such as "01".
            nodevals = self.zh.get(p + "/" + name)
            if nodevals[0]:
                servers[svid] = nodevals[0]
        return servers

    def get_job_servers2(self, jobname):
        """returns servers for job "jobname", including those active
        but not registered at jobs/JOBNAME/servers. elements of returned
        list are dict with svid and name keys. svid key only exists for
        those registered for the "jobname".
        """
        servers = [dict(svid=svid, name=name)
                   for svid, name in self.get_job_servers(jobname).items()]
        regservers = set(s["name"] for s in servers)

        try:
            ss = self.get_children(self.NODE_SERVERS)
            for s in ss:
                if s in regservers:
                    continue
                if not self.is_server_alive(s):
                    continue
                p = self.NODE_SERVERS + "/" + s + "/jobs/" + jobname
                if self.exists(p):
                    servers.append(dict(name=s))
        except ZookeeperError:
            self.LOGGER.debug("zookeeper access failed", exc_info=1)

        return servers

    def add_job_server(self, job, server):
        """Not implemented."""
        pass

    def delete_job_server(self, job, server):
        """Remove ``server`` from ``job``: its registration and its job node."""
        jobservers = dict((v, k) for k, v in self.get_job_servers(job).items())
        if server in jobservers:
            # svid is an int (see get_job_servers); it has to be converted
            # before being appended to the path.
            p = (self.NODE_GJOBS + "/" + job + "/servers/" +
                 str(jobservers[server]))
            self.delete(p)

        p = self.NODE_SERVERS + "/" + server + "/jobs/" + job
        # assumption: there's no child under the server/job node.
        try:
            self.delete(p)
        except NoNodeError:
            pass
        except NotEmptyException:
            # XXX
            pass

    def is_server_alive(self, server):
        """Return the ``exists`` result for the server's alive node."""
        p = self.NODE_SERVERS + "/" + server + "/alive"
        return self.exists(p)

    def add_listener(self, ev, listener):
        """Register ``listener`` for event name ``ev`` (must be a string)."""
        if not isinstance(ev, basestring):
            raise ValueError("ev must be a string")
        ll = self.__listeners.get(ev)
        if not ll:
            self.__listeners[ev] = set((listener,))
        else:
            ll.add(listener)

    def remove_listener(self, ev, listener):
        """Unregister ``listener`` from ``ev``; unknown listeners are ignored."""
        if not isinstance(ev, basestring):
            raise ValueError("ev must be a string")
        ll = self.__listeners.get(ev)
        if ll:
            try:
                ll.remove(listener)
            except KeyError:
                pass

    def fire_event(self, ev, *args):
        """Call every listener of ``ev`` with ``args``; failures are logged."""
        ll = self.__listeners.get(ev)
        if ll:
            # Iterate over a snapshot so a listener may (un)register
            # listeners without breaking the iteration.
            for listener in list(ll):
                try:
                    listener(*args)
                except Exception:
                    self.LOGGER.warning("error running listener %r "
                                        "with ev=%r, args=%r", listener, ev,
                                        args, exc_info=1)

    def shutdown(self):
        """Stop the ZooKeeper client, if any, and drop the reference."""
        if self.zh:
            self.zh.stop()
            self.zh = None
class ZkCoordinator(StaticCoordinator):
    """Zookeeper implementation of `StaticCoordinator`.

    Members claim an ID by creating an ephemeral node named after the ID
    under the members path.  The current assignments are read from a single
    node, which is watched so that updates are picked up as they happen.
    """

    @staticmethod
    def fromGroup(zkConnect: str, group: str) -> 'ZkCoordinator':
        """Convenience method for instantiation using conventional paths based on group.

        The path convention is:
            /static_assignment/[group]/assignment
            /static_assignment/[group]/members

        Args:
            zkConnect (str): Comma-separated list of hosts to connect to
                (e.g. 127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
            group (str): The name of the consumer group this coordinator
                belongs to. Must not be None.

        Raises:
            ValueError: If `group` is None or blank.
        """
        if group is None or len(group.strip()) == 0:
            raise ValueError('ZkCoordinator: Invalid `group` argument, it must not be None or blank.')

        prePath = f'/static_assignment/{group.strip()}'
        assignmentPath = f'{prePath}/assignment'
        membersPath = f'{prePath}/members'
        return ZkCoordinator(zkConnect, membersPath, assignmentPath)

    def __init__(self, zkConnect: str, membersPath: str, assignmentsPath: str):
        """Zookeeper implementation of `StaticCoordinator`

        Args:
            zkConnect (str): Comma-separated list of hosts to connect to
                (e.g. 127.0.0.1:2181,127.0.0.1:2182,[::1]:2183).
            membersPath (str): Zookeeper path at which members will create
                ephemeral nodes asserting their ID.
            assignmentsPath (str): Zookeeper path at which the current
                assignments are kept.

        Raises:
            ValueError: If any argument is None or blank.
        """
        for val, name in (
            (zkConnect, 'zkConnect'),
            (membersPath, 'membersPath'),
            (assignmentsPath, 'assignmentsPath'),
        ):
            if val is None or len(val.strip()) == 0:
                raise ValueError(f'ZkCoordinator: Invalid `{name}` argument, it must not be None or blank')

        logger.info('ZKCoordinator starting with, membersPath=%s, assignmentsPath=%s', membersPath, assignmentsPath)

        self._zkConnect = zkConnect
        self._membersPath = membersPath
        self._membersPathEnsured = False
        self._assignmentsPath = assignmentsPath
        self._assignmentsPathEnsured = False
        self._currentAssignment = None
        self._assignmentsWatcher = None
        self._memberMetaData: Optional[StaticMemberMeta] = None

        self.zk = KazooClient(hosts=zkConnect)
        self.zk.add_listener(self._zkListener())
        self._memberId: Optional[MemberId] = None

    def _zkListener(self):
        """Build the connection-state listener for the Kazoo client."""

        def listener(state):
            # A lost session invalidates the member ID and cached assignment.
            if state == KazooState.LOST:
                self._memberId = None
                self._currentAssignment = None

        return listener

    def _establishSession(self):
        """Install the assignments watch; does nothing if it already exists."""
        if self._assignmentsWatcher is None:
            # add watch for assignment updates
            def watchAssignments(data, stat, event):
                self._currentAssignment = self._processAssignmentsData(data)
                logger.info('Assignment update received. | assignments= %s', self._currentAssignment)

            self._ensureAssignmentsPath()
            # Store the watch under the attribute the guard above checks, so
            # that a repeated start() does not register another watch.
            self._assignmentsWatcher = DataWatch(self.zk, self._assignmentsPath, watchAssignments)

    def _ensureAssignmentsPath(self):
        """Create the assignments path once per coordinator instance."""
        if not self._assignmentsPathEnsured:
            self.zk.ensure_path(self._assignmentsPath)
            self._assignmentsPathEnsured = True

    def _fetchAssignments(self) -> Optional[Assignments]:
        """Return the assignments last delivered by the watch, if any."""
        return self._currentAssignment

    def _processAssignmentsData(self, rawData):
        """Decode the raw node payload into `Assignments`.

        Returns None when there is no payload.  An empty payload is treated
        the same way instead of being handed to the JSON parser.
        """
        if not rawData:
            return None
        return Assignments.fromJson(rawData.decode('utf-8'))

    def _ensureMembersPath(self):
        """Create the members path once per coordinator instance."""
        if not self._membersPathEnsured:
            self.zk.ensure_path(self._membersPath)
            self._membersPathEnsured = True

    def _createPath(self, altMemberId: Optional[MemberId] = None):
        """Return the member node path for `altMemberId`, else for our own ID.

        Returns None when neither ID is available.
        """
        # Compare against None explicitly: member ID 0 is a valid ID.
        mid = altMemberId if altMemberId is not None else self._memberId
        if mid is None:
            return None
        return f'{self._membersPath}/{mid}'

    def _encodeMemberData(self, meta: StaticMemberMeta):
        """Serialize member meta data to UTF-8 encoded JSON."""
        return ujson.dumps(meta.asDict()).encode('utf-8')

    def _compareAndUpdateMemberData(self, meta: StaticMemberMeta):
        """Write `meta` to our member node if it differs from the stored copy.

        Compared fields are the host ID and the assignment's config version
        and version.  The write is asynchronous; failures are logged.
        """
        if self._memberMetaData is not None and meta is not None:
            selfDict = self._memberMetaData.asDict()
            newDict = meta.asDict()
            isDiff = (
                selfDict['hostId'] != newDict['hostId']
                or selfDict['assignment']['configVersion'] != newDict['assignment']['configVersion']
                or selfDict['assignment']['version'] != newDict['assignment']['version']
            )
        else:
            isDiff = True

        if isDiff:
            self._memberMetaData = meta
            path = self._createPath()
            if path is not None:

                def cb(async_obj):
                    try:
                        async_obj.get()
                        logger.info('Member meta data updated. | metaData=%s', meta)
                    except (ConnectionLoss, SessionExpiredError):
                        logger.exception('Failed to update member meta data.')

                self.zk.set_async(path, self._encodeMemberData(meta)).rawlink(cb)

    def updateAssignments(self, meta: StaticMemberMeta, newAssignments: Assignments):
        """Store `newAssignments` at the assignments path, using the client's retry."""
        self.zk.retry(self._innerUpdateAssignment, newAssignments)

    def _innerUpdateAssignment(self, assignment: Assignments):
        """Single attempt at writing the assignments node."""
        self._ensureAssignmentsPath()
        self.zk.set(self._assignmentsPath, assignment.asJson().encode('utf-8'))
        logger.info('Assignments updated. | assignments=%s', assignment)

    def leave(self, meta: StaticMemberMeta):
        """Give up our member ID, using the client's retry."""
        self.zk.retry(self._innerLeave)

    def _innerLeave(self):
        """Single attempt at deleting our member node and forgetting our ID."""
        path = self._createPath()
        if path is not None:
            try:
                self.zk.delete(path)
            except (ConnectionLoss, SessionExpiredError):
                logger.exception(
                    'Failed to relinquish member ID, '
                    "will assume ephemeral node will expire on it's own. "
                    '| memberId=%s',
                    self._memberId,
                )
            self._memberId = None

    def join(self, meta: StaticMemberMeta):
        """Acquire a member ID if we do not hold one yet.

        Returns:
            The member ID, or None when no assignments are known yet or no
            ID could be acquired.
        """
        asgns = self._fetchAssignments()
        if asgns is None:
            logger.warning('Cannot join a group without assignments. | assignmentsPath=%s', self._assignmentsPath)
            return None

        if self._memberId is None:
            self._memberId = self._inner_join(meta, asgns.maxMembers)

        return self._memberId

    def _inner_join(self, meta: StaticMemberMeta, maxMembers: int) -> Optional[MemberId]:
        """Claim the first free ID in ``range(maxMembers)``.

        Returns:
            The claimed ID, or None when every attempt failed.
        """
        memberData = self._encodeMemberData(meta)
        self._ensureMembersPath()

        foundMid = None
        for mid in range(maxMembers):
            memberIdPath = self._createPath(mid)
            try:
                self.zk.create(memberIdPath, memberData, ephemeral=True)
                foundMid = mid
                # Use the module logger like every other message in the class.
                logger.debug('Member id acquired. | memberId=%s', mid)
                break
            except NodeExistsError:
                # move onto the next node
                logger.debug('Member id already taken moving to next. | memberId=%s', mid)
            except (ConnectionLoss, SessionExpiredError):
                logger.exception('Member id acquisition attempt failed with error.')
                time.sleep(1)

        self._memberMetaData = meta
        return foundMid

    def assignments(self, meta: StaticMemberMeta) -> Optional[Assignments]:
        """Publish `meta` if it changed and return the current assignments."""
        self._compareAndUpdateMemberData(meta)
        return self._fetchAssignments()

    def heartbeat(self, meta: StaticMemberMeta) -> Optional[MemberId]:
        """Publish `meta` if it changed and return our member ID, if any."""
        self._compareAndUpdateMemberData(meta)
        return self._memberId

    def stop(self):
        """Stop and close the Zookeeper client."""
        self.zk.stop()
        self.zk.close()

    def start(self):
        """Start the Zookeeper client and install the assignments watch."""
        self.zk.start()
        self._establishSession()