def _get_registered_nodes(zk: KazooClient, zk_path: str) -> List[str]:
    """
    Look up the IPs of the nodes that have registered in ZooKeeper.

    The ZNode `zk_path` is expected to exist already, having been created
    during cluster bootstrap.

    Args:
        zk:
            The client to use to communicate with ZooKeeper.
        zk_path:
            The path of the ZNode to use for node registration.

    Returns:
        A list of internal IP addresses of nodes that have previously
        joined the CockroachDB cluster; an empty list when the ZNode holds
        no data.
    """
    # `sync()` is issued ahead of the read so that the value we get back
    # reflects the latest data written to ZooKeeper.
    # See https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#ch_zkGuarantees
    log.info("Calling sync() on ZNode `{}`".format(zk_path))
    zk.sync(zk_path)

    log.info("Loading data from ZNode `{}`".format(zk_path))
    raw, _ = zk.get(zk_path)

    # Guard clause: an empty ZNode means nobody has registered yet.
    if not raw:
        log.info("Found no registered nodes.")
        return []

    log.info("Cluster was previously initialized.")
    registered = json.loads(raw.decode('ascii'))['nodes']  # type: List[str]
    log.info("Found registered nodes: {}".format(registered))
    return registered
def _register_cluster_membership(zk: KazooClient, zk_path: str, ip: str) -> List[str]:
    """
    Record `ip` as a cluster member in ZooKeeper.

    The ZK lock must be held around the call to this function.

    Args:
        zk:
            The client to use to communicate with ZooKeeper.
        zk_path:
            The path of the ZNode to use for node registration.
        ip:
            The ip to add to the list of cluster member IPs in ZooKeeper.

    Returns:
        The list of registered member IPs, including `ip`.
    """
    log.info("Registering cluster membership for `{}`".format(ip))

    # Always work from the latest list of cluster members.
    members = _get_registered_nodes(zk=zk, zk_path=zk_path)

    if ip not in members:
        log.info("Adding `{}` to list of nodes `{}`".format(ip, members))
        members.append(ip)
        payload = json.dumps({"nodes": members}).encode("ascii")
        zk.set(zk_path, payload)
        zk.sync(zk_path)
        log.info("Successfully registered cluster membership for `{}`".format(ip))
        return members

    # Nothing to write: this member is already registered with ZK.
    log.info(
        "Cluster member `{}` already registered in ZooKeeper. Skipping.".format(ip))
    return members
def _register_cluster_membership(zk: KazooClient, zk_path: str, ip: str) -> List[str]:
    """
    Append `ip` to the cluster member list stored in ZooKeeper, if missing.

    The ZK lock must be held around the call to this function.

    Args:
        zk:
            The client to use to communicate with ZooKeeper.
        zk_path:
            The path of the ZNode to use for node registration.
        ip:
            The ip to add to the list of cluster member IPs in ZooKeeper.

    Returns:
        The member list after the call (unchanged when `ip` was already
        present).
    """
    log.info("Registering cluster membership for `{}`".format(ip))

    # Read the latest list of cluster members before deciding anything.
    current = _get_registered_nodes(zk=zk, zk_path=zk_path)
    already_member = ip in current
    if already_member:
        # We're already registered with ZK.
        log.info("Cluster member `{}` already registered in ZooKeeper. Skipping.".format(ip))
        return current

    log.info("Adding `{}` to list of nodes `{}`".format(ip, current))
    current.append(ip)

    document = {"nodes": current}
    encoded = json.dumps(document).encode("ascii")
    zk.set(zk_path, encoded)
    zk.sync(zk_path)

    log.info("Successfully registered cluster membership for `{}`".format(ip))
    return current
def remove_cluster_membership(zk: KazooClient, zk_path: str, ip: str) -> List[str]:
    """
    Remove `ip` from the list of cluster members registered in ZooKeeper.

    The ZK lock must be held around the call to this function.

    Args:
        zk:
            The client to use to communicate with ZooKeeper.
        zk_path:
            The path of the ZNode to use for node registration.
        ip:
            The ip to remove from the list of cluster member IPs in
            ZooKeeper.

    Returns:
        The list of registered member IPs after the call. It is returned
        unchanged when `ip` was not registered.
    """
    log.info("Removing cluster membership for `%s`", ip)

    # Get the latest list of cluster members.
    # NOTE(review): this calls `get_registered_nodes` (no leading
    # underscore); confirm that name is defined or imported in this module.
    nodes = get_registered_nodes(zk=zk, zk_path=zk_path)
    if ip not in nodes:
        # Nothing to do: the member is not registered with ZK.
        log.info(
            "Cluster member `%s` already removed from Zookeeper. Skipping.", ip)
        return nodes

    log.info("Removing `%s` from list of nodes `%s`", ip, nodes)
    nodes.remove(ip)
    zk.set(zk_path, json.dumps({"nodes": nodes}).encode("ascii"))
    # Flush the write so later readers that sync observe the updated list.
    zk.sync(zk_path)
    log.info("Successfully removed %s from the cluster", ip)
    return nodes
def _get_registered_nodes(zk: KazooClient, zk_path: str) -> List[str]:
    """
    Read the registration ZNode and return the IPs stored in it.

    The ZNode `zk_path` is expected to exist, having been created during
    cluster bootstrap.

    Args:
        zk:
            The client to use to communicate with ZooKeeper.
        zk_path:
            The path of the ZNode to use for node registration.

    Returns:
        A list of internal IP addresses of nodes that have previously
        joined the CockroachDB cluster, or `[]` if the ZNode is empty.
    """
    # Sync first, then read, so that we see the latest data written to
    # ZooKeeper.
    # See https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#ch_zkGuarantees
    log.info("Calling sync() on ZNode `{}`".format(zk_path))
    zk.sync(zk_path)
    log.info("Loading data from ZNode `{}`".format(zk_path))
    contents, _stat = zk.get(zk_path)

    found = []  # type: List[str]
    if contents:
        log.info("Cluster was previously initialized.")
        decoded = contents.decode('ascii')
        found = json.loads(decoded)['nodes']
        log.info("Found registered nodes: {}".format(found))
    else:
        log.info("Found no registered nodes.")
    return found
class StoreServer(object):
    """Distributed store server.

    Every server creates an ephemeral, sequential znode under ``/ELECTION``.
    The server owning the lowest sequence number acts as leader; every other
    server watches the znode that immediately precedes its own.  Rows live in
    a SQLite table named ``store`` that is accessed through the ``cur`` and
    ``conn`` names defined outside this class.

    All ``print`` calls pass one pre-formatted argument, so the output is the
    same whether ``print`` is a statement (Python 2) or a function.
    """

    def __init__(self, addr, zookeeper_config='zookeeper_config', store_config='store_config'):
        # get()/get_all()/put() busy-wait while this flag is True; it is
        # cleared at the end of synchronize().
        self.synchronizing = True
        self.addr = addr
        self.zookeeper_config = zookeeper_config
        self.store_config = store_config

    def start(self):
        """Start store server"""
        self.connect_zookeeper()

    def get_max_id_db(self):
        """Get the maximum id in the database (0 when the table is empty)"""
        print("get_max_id_db")
        try:
            cur.execute("SELECT MAX(id) FROM store ORDER BY id DESC LIMIT 1")
            result = cur.fetchone()
            if result[0] is not None:
                print("max id: %s" % int(result[0]))
                return int(result[0])
            print("max id: 0")
            return 0
        except Exception:
            raise StoreException('SQLite')

    def write_db(self, key, value):
        """Write key-value pair to database.

        Returns True on success and raises StoreException('SQLite') on any
        database error.
        """
        print("write_db: key: %s, value: %s" % (str(key), str(value)))
        try:
            # Bound parameters instead of string concatenation: a quote in
            # key/value can no longer break or inject into the statement.
            cur.execute("INSERT INTO store (key, value) VALUES(?, ?)",
                        (str(key), str(value)))
            conn.commit()
            print("put successful")
            return True
        except Exception:
            raise StoreException('SQLite')

    def get_snapshot(self):
        """Get a snapshot of the current database.

        Returns a tuple of ``[key, value, id]`` lists ordered by ascending id.
        """
        print("get_snapshot")
        try:
            cur.execute("SELECT key, value, id FROM store ORDER BY id ASC")
            all_pairs = [[str(row[0]), str(row[1]), int(row[2])]
                         for row in cur.fetchall() if len(row) == 3]
            print("pairs: %s" % str(all_pairs))
            return tuple(all_pairs)
        except Exception:
            raise StoreException('SQLite')

    def write_snapshot(self, snapshot):
        """Replace current database with given database snapshot"""
        print("write_snapshot")
        try:
            cur.execute("DROP TABLE IF EXISTS store")
            cur.execute("CREATE TABLE IF NOT EXISTS store (key TEXT, value TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)")
            cur.executemany("INSERT INTO store VALUES(?, ?, ?)", snapshot)
            conn.commit()
        except Exception:
            raise StoreException('SQLite')

    def connect_zookeeper(self):
        """Connect to the peer stores and ZooKeeper, then start leader election"""
        # `with` closes the config file even if reading fails.
        with open(self.store_config, 'r') as store_file:
            self.stores = [line.rstrip() for line in store_file]
        print('My addr: %s' % (self.addr))
        print('Store list: %s' % (str(self.stores)))
        self.n = len(self.stores)

        # self.connections[k] is the zerorpc client for self.servers[k].
        self.connections = []
        self.servers = []
        for i, server in enumerate(self.stores):
            if server == self.addr:
                self.i = i
            else:
                try:
                    c = zerorpc.Client(timeout=3)
                    c.connect('tcp://' + server)
                    self.connections.append(c)
                    self.servers.append(server)
                    print("Create connection with: %s" % server)
                except zerorpc.TimeoutExpired:
                    print('(connection) Timeout! %s' % server)

        self.zookeepers = ''
        # Honour the path given to __init__ (its default is the file name
        # that used to be hard-coded here).
        with open(self.zookeeper_config, 'r') as zk_config:
            self.zookeepers = zk_config.read().replace('\n', ',')
        # NOTE(review): self.zookeepers is read but not passed to
        # KazooClient, so the client uses its default hosts -- confirm that
        # this is intended.
        self.zk = KazooClient(timeout=3)
        self.zk.start()

        # Start election
        self.full_path = self.zk.create('/ELECTION/' + self.addr + '_', ephemeral=True, sequence=True, makepath=True)
        # Remove '/ELECTION/' from path
        self.short_path = self.full_path[len('/ELECTION/'):]
        # Remove address from path
        self.path_id = self.short_path[self.short_path.find('_') + 1:]
        print("path_id: %s" % self.path_id)

        leader = self._run_election()
        if leader != self.short_path:
            # Synchronize with leader
            leader_addr = str(leader[:leader.find('_')])
            try:
                print("leader_addr: %s" % leader_addr)
                leader_index = self.servers.index(leader_addr)
                print("leader_index: %s" % leader_index)
                print("Calling synchronize on leader...")
                self.connections[leader_index].synchronize()
            except Exception:
                # Best effort: the leader may be unreachable or unknown
                # while an election is still settling.
                print("Leader election in progress.")

    def _run_election(self):
        """Re-read /ELECTION; take leadership or watch our predecessor.

        Returns the znode name (``<addr>_<sequence>``) of the current leader.
        """
        self.zk.sync("/ELECTION")
        children = self.zk.get_children("/ELECTION")
        # Map each sequence suffix to its znode name so lookups are exact
        # matches rather than substring tests.
        by_id = {child[child.find('_') + 1:]: child for child in children}
        sorted_ids = sorted(child[child.find('_') + 1:] for child in children)
        self.sorted_i = sorted_ids.index(self.path_id)
        leader = by_id[sorted_ids[0]]
        if leader == self.short_path:
            self.leader_function()
        else:
            # Watch only the znode just before ours (avoids the herd effect).
            watch_server = by_id[sorted_ids[self.sorted_i - 1]]
            self.zk.exists("/ELECTION/" + watch_server, watch=self.watch_znode)
            print("Set watch on: %s" % ("/ELECTION/" + watch_server))
        return leader

    def _connection_index(self, server):
        """Return the index of `server` in self.servers, connecting first if it is new"""
        try:
            return self.servers.index(server)
        except ValueError:
            c = zerorpc.Client(timeout=3)
            c.connect('tcp://' + server)
            self.connections.append(c)
            self.servers.append(server)
            return self.servers.index(server)

    def _max_id_for_key(self, key):
        """Return MAX(id) over the rows stored for `key`, or None if there are none"""
        cur.execute("SELECT MAX(id) FROM store WHERE key=? ORDER BY id DESC LIMIT 1", (key,))
        return cur.fetchone()[0]

    def _wait_for_sync(self, synchronizing):
        """Block while a synchronization is running, unless the caller is part of it"""
        while self.synchronizing is True and synchronizing is False:
            print("Waiting on synchronization")
            time.sleep(1)

    def get_online_list(self, children):
        """Get list of online store servers (every /ELECTION child except this server)"""
        online_list = []
        for child in children:
            child = str(child)
            server = child[:child.find('_')]
            if server != self.addr:
                online_list.append(server)
        print("online list: %s" % online_list)
        return online_list

    def leader_function(self):
        """Function that is run by newly elected leader"""
        print("I am the leader!")
        self.synchronize()

    def watch_znode(self, event):
        """Watch the next znode down in the sequence of znodes (avoids the herd effect)"""
        print("znode watch triggered")
        self._run_election()

    def get(self, key, synchronizing=False):
        """Get most recent value for the given key in the database.

        Returns the value as a string; raises StoreException('SQLite') on any
        database error.
        """
        print("get: %s" % key)
        self._wait_for_sync(synchronizing)
        try:
            cur.execute("SELECT key, value, MAX(id) FROM store WHERE key=? ORDER BY id DESC LIMIT 1", (key,))
            result = cur.fetchone()
            if len(result) != 3:
                raise StoreException('SQLite')
            # NOTE(review): for a key with no rows the aggregate row holds
            # NULLs, so this returns the string 'None' -- confirm that
            # callers expect that.
            print("get result: %s" % result[1])
            return str(result[1])
        except Exception:
            raise StoreException('SQLite')

    def get_all(self, synchronizing=False):
        """Get the most recent key-value pair for each key in the database.

        Returns a JSON array of ``{"key", "value", "id"}`` objects ordered by key.
        """
        print("get_all")
        self._wait_for_sync(synchronizing)
        try:
            # MAX(id) makes SQLite take the bare `value` column from the
            # newest row of each key; a plain GROUP BY picks an arbitrary row.
            cur.execute("SELECT key, value, MAX(id) FROM store GROUP BY key ORDER BY key")
            all_pairs = [{'key': str(row[0]), 'value': str(row[1]), 'id': int(row[2])}
                         for row in cur.fetchall() if len(row) == 3]
            # Print key-values pairs (for debugging)
            print("pairs: %s" % str(all_pairs))
            return json.dumps(all_pairs)
        except Exception:
            raise StoreException('SQLite')

    def put(self, key, value, synchronizing=False):
        """Write a key-value pair to the database, replicating it to the online servers.

        Every online replica is first asked to accept the next version of
        `key`.  If any replica refuses, the servers are synchronized and the
        put is retried; otherwise the value is written to the replicas and
        then locally.

        Returns True on success.  Raises StoreException when no replica is
        available or on a database error.
        """
        print("put: %s, %s" % (key, value))
        self._wait_for_sync(synchronizing)

        # Get most recent ID for key and derive the next version from it
        try:
            latest = self._max_id_for_key(key)
            print("most recent version: %s" % latest)
            version = str((int(latest) if latest is not None else 0) + 1)
        except Exception:
            raise StoreException('SQLite')

        # Begin two-phase commit
        self.zk.sync("/ELECTION")
        children = self.zk.get_children("/ELECTION")
        if len(children) < 2:
            raise StoreException('No replicas available.')
        znode = '/PUT/' + key + "/" + version
        try:
            # Remove znode if it exists; failure (e.g. no such node) is fine
            self.zk.delete(znode, recursive=True)
        except Exception:
            pass
        put_path = self.zk.ensure_path(znode)
        online_list = self.get_online_list(children)

        accepted_list = []
        for server in online_list:
            index = self._connection_index(server)
            try:
                accepted_list.append(self.connections[index].replicate_accept_put(put_path, key, version))
            except Exception:
                # Unreachable replicas simply do not vote
                continue
        accepted_true = sum(1 for response in accepted_list if response is True)
        accepted_false = len(accepted_list) - accepted_true

        if accepted_false > 0:
            # Data on replicas is not consistent: repair, then retry and
            # hand the retry's result back to the caller.
            self.synchronize()
            return self.put(key, value)
        elif accepted_true > 0:
            print("Quorum accepted. Setting value to %s" % str(value))
            for server in online_list:
                index = self._connection_index(server)
                try:
                    self.connections[index].replicate_write_put(put_path, key, value)
                except Exception:
                    continue
            # Write to db
            return self.write_db(key, value)
        else:
            raise StoreException('No replicas available.')

    def synchronize(self):
        """Ensure consistency across the servers by exchanging database snapshots.

        For each online server, whichever side has the lower maximum id
        receives a full snapshot from the other side.
        """
        print("synchronize")
        self.synchronizing = True
        try:
            self.zk.sync("/ELECTION")
            children = self.zk.get_children("/ELECTION")
            online_list = self.get_online_list(children)
            max_id = self.get_max_id_db()
            for server in online_list:
                index = self._connection_index(server)
                try:
                    server_max_id = self.connections[index].get_max_id_db()
                    print("max_id: %s" % max_id)
                    print("%s max_id: %s" % (server, server_max_id))
                    if server_max_id == max_id:
                        continue
                    elif server_max_id > max_id:
                        snapshot = self.connections[index].get_snapshot()
                        print("replica snapshot: %s" % str(snapshot))
                        self.write_snapshot(snapshot)
                        # The local database now matches this replica, so
                        # compare the remaining servers against the new id.
                        max_id = server_max_id
                    else:
                        snapshot = self.get_snapshot()
                        print("my snapshot: %s" % str(snapshot))
                        self.connections[index].write_snapshot(snapshot)
                except Exception:
                    print("Timeout! Skipping %s, must be offline." % server)
                    continue
        finally:
            # Always clear the flag; otherwise a failure here would leave
            # get()/put() waiting forever.
            self.synchronizing = False

    def replicate_accept_put(self, put_path, key, id):
        """Tell the leader whether version `id` is the next one for `key` on this replica.

        Returns True when `id` equals this replica's most recent id for the
        key plus one, False otherwise.
        """
        print("replicate_accept_put: %s, key: %s, id: %s" % (put_path, key, id))
        try:
            # Get most recent ID for key
            latest = self._max_id_for_key(key)
            max_id = int(latest) if latest is not None else 0
            print("most recent version: %s" % latest)
        except Exception:
            raise StoreException('SQLite')
        # False means a different version of the key was found on this replica
        return int(id) == max_id + 1

    def replicate_write_put(self, put_path, key, value):
        """Write the given key to the database"""
        print("replicate_write_put: %s" % put_path)
        # Write to db
        self.write_db(key, value)
        # Print current database (for debugging)
        self.get_all(synchronizing=True)
        print("replicate_write_put: key: %s, value: %s" % (key, value))
class Queue(object):
    """ZooKeeper-backed job queue.

    Owns the Kazoo session, the Kazoo ``LockingQueue`` and a cache of named
    locks, and makes sure the ZooKeeper paths it uses exist.
    """

    ZK_TREE_ROOT = 'bzmzq'
    ZK_QUEUE_LOCK_NAME = 'main-lock'

    def __init__(self, zk_servers, queue_name):
        """Open a Kazoo session to `zk_servers` and prepare the paths for `queue_name`."""
        self._queue_name = queue_name
        self.kz_ses = KazooClient(zk_servers)
        self.kz_ses.start()
        self.servers = zk_servers

        kz_queue_path = str(self.path_factory.queue.kz_queue())
        self._kz_queue = self.kz_ses.LockingQueue(kz_queue_path)

        # _tlock serializes access to the lock cache in get_lock().
        self._tlock = TLock()
        self._rlock_cache = WeakValueDictionary()  # {<rlock_name>: RLock}
        self._make_paths()

    @cached_prop
    def path_factory(self):
        """PathFactory bound to this queue."""
        return PathFactory(self)

    @cached_prop
    def queue_name(self):
        """Name this queue was created with."""
        return self._queue_name

    def _make_paths(self):
        """Create and sync the root, job-state and scheduled-job-state paths."""
        session = self.kz_ses

        def ensure_and_sync(zk_path):
            session.ensure_path(str(zk_path))
            session.sync(zk_path)

        for root in self.path_factory.get_path_roots():
            ensure_and_sync(str(root))

        job_state_ids = list(JobStates().values())
        for job_state_id in job_state_ids:
            ensure_and_sync(str(self.path_factory.job_state.id(job_state_id)))

        scheduled_state_ids = list(ScheduledJobStates().values())
        for scheduled_state_id in scheduled_state_ids:
            ensure_and_sync(
                str(self.path_factory.scheduled_job_state.id(scheduled_state_id)))

    def get_lock(self, lock_name=None):
        """Return the lock for `lock_name` (the main queue lock when omitted).

        A lock found in the cache is reused; otherwise a new one is created
        and cached.
        """
        with self._tlock:
            name = lock_name or self.ZK_QUEUE_LOCK_NAME
            existing = self._rlock_cache.get(name)
            if existing:
                return existing
            created = self.kz_ses.Lock(str(self.path_factory.lock.name(name)))
            self._rlock_cache[name] = created
            return created

    def create_job(self, *args, **kwargs):
        """Create a Job on this queue; arguments are forwarded to Job.create."""
        return Job.create(self, *args, **kwargs)

    def create_scheduled_job(self, *args, **kwargs):
        """Create a ScheduledJob on this queue; arguments are forwarded to ScheduledJob.create."""
        return ScheduledJob.create(self, *args, **kwargs)

    def get_jobs(self, state=None):
        """List jobs, optionally only those in `state`.

        Raises ValueError when `state` is not one of the JobStates values.
        """
        if state is not None:
            if state not in list(JobStates().values()):
                raise ValueError("Unknown job state")
            listing_path = str(self.path_factory.job_state.id(state))
        else:
            listing_path = str(self.path_factory.job.root())
        job_ids = self.kz_ses.get_children(listing_path)
        return [Job(self, job_id) for job_id in job_ids]

    def get_scheduled_jobs(self, state=None):
        """List scheduled jobs, optionally only those in `state`.

        Raises ValueError when `state` is not one of the ScheduledJobStates
        values.
        """
        if state is not None:
            if state not in list(ScheduledJobStates().values()):
                raise ValueError("Unknown scheduled job state")
            listing_path = str(self.path_factory.scheduled_job_state.id(state))
        else:
            listing_path = str(self.path_factory.scheduled_job.root())
        scheduled_ids = self.kz_ses.get_children(listing_path)
        return [ScheduledJob(self, scheduled_job_id)
                for scheduled_job_id in scheduled_ids]
class Store(object):
    """API for interacting with distributed store servers.

    Every request is sent to the current leader, which is the server owning
    the ``/ELECTION`` znode with the lowest sequence suffix.

    All ``print`` calls pass one pre-formatted argument, so the output is the
    same whether ``print`` is a statement (Python 2) or a function.
    """

    def __init__(self, zookeepers=''):
        # Connect to ZooKeeper
        # NOTE(review): `zookeepers` is accepted but not forwarded to
        # KazooClient, so the client uses its default hosts -- confirm that
        # this is intended.
        self.zk = KazooClient()
        self.zk.start()

    def _call_leader(self, method, args, timeout, fixed_delay=None, verbose=False):
        """Call `method(*args)` on the current leader, retrying on RPC timeout.

        Makes up to four attempts and looks the leader up again before each
        one.  After a timeout it sleeps `fixed_delay` seconds, or, when that
        is None, one second less than the number of failures so far.

        Raises StoreException('Timeout') once every attempt has timed out.
        """
        retries = 0
        while retries <= 3:
            if verbose:
                print("%s (%s)" % (method, retries))
            leader = self.get_leader()
            if verbose:
                print("leader: %s" % leader)
            c = zerorpc.Client(timeout=timeout)
            try:
                c.connect('tcp://' + leader)
                return getattr(c, method)(*args)
            except zerorpc.TimeoutExpired:
                print("%s timeout" % method)
            finally:
                # A fresh client is created per attempt; close it so its
                # sockets are not leaked.
                c.close()
            retries = retries + 1
            time.sleep(retries - 1 if fixed_delay is None else fixed_delay)
        raise StoreException('Timeout')

    def get(self, key):
        """Get key from store"""
        return self._call_leader('get', (key,), timeout=10)

    def get_all(self):
        """Get all keys from store"""
        return self._call_leader('get_all', (), timeout=10, fixed_delay=5, verbose=True)

    def put(self, key, value):
        """Write a key-value pair to the store"""
        return self._call_leader('put', (key, value), timeout=20)

    def get_leader(self):
        """Get the current leader store server.

        Returns the address part of the leader's znode name, or '' when
        ``/ELECTION`` has no children.
        """
        self.zk.sync("/ELECTION")
        children = self.zk.get_children("/ELECTION")
        print("children: %s" % children)
        if not children:
            return ''
        # Znode names look like <addr>_<sequence>; the lowest sequence wins.
        leader = min(children, key=lambda child: child[child.find('_') + 1:])
        return leader[:leader.find('_')]

    def get_online_list(self):
        """Get the current list of online store servers, sorted by address"""
        self.zk.sync("/ELECTION")
        children = self.zk.get_children("/ELECTION")
        return sorted([str(child[:child.find('_')]) for child in children])
class Backend:
    """Backend node that serves top phrases for a prefix from a loaded trie.

    The node joins a partition of a target by creating an ephemeral,
    sequential znode under
    ``/phrases/distributor/<target>/partitions/<partition>/nodes/``, loads
    that partition's trie from HDFS and then answers prefix queries.  It
    deactivates itself once its target is neither the current nor the next
    target.
    """

    def __init__(self):
        self._logger = logging.getLogger('gunicorn.error')
        self._zk = KazooClient(hosts=f'{os.getenv("ZOOKEEPER_HOST")}:2181')
        self._hdfsClient = HdfsClient(os.getenv("HADOOP_NAMENODE_HOST"))
        self._active = False        # True once a trie is loaded and served
        self._target_id = None      # target this node has joined
        self._zk_node_path = None   # znode created for this node
        self._range = None
        self._trie = None
        # Retry joining once a minute; the job returns at once while the
        # node is active.
        scheduler = BackgroundScheduler(timezone="UTC")
        scheduler.add_job(self._attempt_to_join_any, 'interval', minutes=1)
        scheduler.start()

    def start(self):
        """Start the ZooKeeper session and watch the next/current target znodes."""
        self._zk.start()
        DataWatch(client=self._zk,
                  path=ZK_NEXT_TARGET,
                  func=self._on_next_target_changed)
        DataWatch(client=self._zk,
                  path=ZK_CURRENT_TARGET,
                  func=self._on_current_target_changed)

    def stop(self):
        """Stop the ZooKeeper session."""
        self._zk.stop()

    def top_phrases_for_prefix(self, prefix):
        """Return the trie's top phrases for `prefix`.

        Raises NodeInactiveError when this node is not active.
        """
        if not self._active:
            raise NodeInactiveError(
                "This backend node is not active. Consult zookeeper for the most recent active nodes"
            )
        return self._trie.top_phrases_for_prefix(prefix)

    def _on_next_target_changed(self, data, stat, event=None):
        """Watch callback: leave an obsolete target, then try to join the next one."""
        self._logger.info("_on_next_target_changed Data is %s", data)
        if data is None:
            return
        current_target_id = self._zk.get(ZK_CURRENT_TARGET)[0].decode()
        next_target_id = data.decode()
        self._deactivate_if_not_used(current_target_id, next_target_id)
        self._attempt_to_join_target(next_target_id)

    def _on_current_target_changed(self, data, stat):
        """Watch callback: leave the joined target if it is no longer in use."""
        self._logger.info("_on_current_target_changed Data is %s", data)
        if data is None:
            return
        current_target_id = data.decode()
        next_target_id = self._zk.get(ZK_NEXT_TARGET)[0].decode()
        self._deactivate_if_not_used(current_target_id, next_target_id)

    def _reset_state(self):
        """Forget the joined target, the loaded trie and our znode path."""
        self._active = False
        self._target_id = None
        self._trie = None
        self._zk_node_path = None

    def _release_node(self, node_path):
        """Delete the znode we created at `node_path` and stop tracking it."""
        try:
            self._zk.delete(node_path)
        finally:
            if self._zk_node_path == node_path:
                self._zk_node_path = None

    def _deactivate_if_not_used(self, current_target_id, next_target_id):
        """Deactivate when our target is neither the current nor the next target."""
        if not (self._active and self._target_id):
            return
        if self._target_id in (current_target_id, next_target_id):
            return
        self._logger.info(
            f'Deactivating {self._target_id}, {current_target_id}, {next_target_id}'
        )
        if self._zk.exists(self._zk_node_path):
            self._zk.delete(self._zk_node_path)
        self._reset_state()

    def _attempt_to_join_any(self):
        """Scheduled job: try the current target first, then the next target."""
        self._logger.debug("Attempting to join any")
        if self._active:
            return
        for target_path in (ZK_CURRENT_TARGET, ZK_NEXT_TARGET):
            if self._zk.exists(target_path) is not None:
                target_id = self._zk.get(target_path)[0].decode()
                if self._attempt_to_join_target(target_id):
                    return

    def _attempt_to_join_target(self, target_id):
        """Try to take a free node slot in one partition of `target_id`.

        Returns True once this node has joined a partition and loaded its
        trie.  Returns False when the node is already active, the target does
        not exist, no partition could be joined, or an error occurred.
        """
        if self._active:
            return False
        if (not target_id or
                self._zk.exists(f'/phrases/distributor/{target_id}') is None):
            return False
        self._logger.info(f'Attempting to join {target_id}')
        partitions = self._zk.get_children(
            f'/phrases/distributor/{target_id}/partitions')
        for partition in partitions:
            nodes_path = f'/phrases/distributor/{target_id}/partitions/{partition}/nodes/'
            self._zk.sync(nodes_path)
            nodes = self._zk.get_children(nodes_path)
            if len(nodes) >= NUMBER_NODES_PER_PARTITION:
                self._logger.info(
                    f'Cannot join {nodes_path}, it has enough nodes {nodes}; current number of nodes {len(nodes)} >= {NUMBER_NODES_PER_PARTITION}'
                )
                continue  # No more nodes needed here
            created_node_path = self._zk.create(nodes_path,
                                                value=b'',
                                                ephemeral=True,
                                                sequence=True)
            self._zk_node_path = created_node_path
            try:
                created_node_name = created_node_path.split('/')[-1]
                self._zk.sync(nodes_path)
                nodes = self._zk.get_children(nodes_path)
                nodes.sort()
                # Only the first NUMBER_NODES_PER_PARTITION sequence numbers
                # win a slot; a higher number means others joined first.
                if (nodes and int(created_node_name) > int(
                        nodes[0:NUMBER_NODES_PER_PARTITION][-1])):
                    # The node was not able to join the partition
                    self._logger.info(
                        f'Cannot join {nodes_path}, it has already filled up {nodes}; created_node_name: {created_node_name}'
                    )
                    self._release_node(created_node_path)
                    continue
                if not self._load_trie_and_activate(target_id, partition):
                    self._logger.error(
                        f'Error while loading and activating trie for {nodes_path}, target_id: {target_id}, partition: {partition}'
                    )
                    self._release_node(created_node_path)
                    continue
                # Finishes the initialization
                self._zk.set(created_node_path,
                             socket.gethostname().encode())
                return True
            except Exception:
                self._logger.exception(
                    f'Unexpected error while joining {nodes_path}')
                # Undo a possible activation so the node does not believe it
                # is active after its znode has been removed.
                self._reset_state()
                try:
                    self._zk.delete(created_node_path)
                except Exception:
                    self._logger.exception(
                        f'Could not delete {created_node_path}')
                return False
        return False

    def _load_trie_and_activate(self, target_id, partition):
        """Load the partition's trie and mark this node active.

        Returns True when a trie was loaded, False when the loaded object is
        falsy.
        """
        trie_data_hdfs_path = f'/phrases/distributor/{target_id}/partitions/{partition}/trie_data_hdfs_path'
        trie = self._load_trie(self._zk.get(trie_data_hdfs_path)[0].decode())
        if not trie:
            return False
        self._trie = trie
        self._active = True
        self._target_id = target_id
        self._logger.info(
            f'Now ACTIVE and loaded trie for partition {partition} and target_id {target_id}'
        )
        return True

    def _load_trie(self, trie_hdfs_path):
        """Download the trie file from HDFS to ``trie.dat`` and unpickle it."""
        local_path = 'trie.dat'
        self._hdfsClient.download(trie_hdfs_path, local_path)
        with open(local_path, 'rb') as f:
            # SECURITY: pickle.load runs arbitrary code from the file; this
            # is only safe if the HDFS content is fully trusted.
            return pickle.load(f)