def lock_release(self, name, lock_id, timeout=None, silent=False, thr=None):
    released = False
    if timeout is None:
        timeout = 5
    deadline = time.time() + timeout
    with shared.LOCKS_LOCK:
        if not lock_id or shared.LOCKS.get(name, {}).get("id") != lock_id:
            return
        del shared.LOCKS[name]
    shared.wake_monitor(reason="unlock", immediate=True)
    if not silent:
        thr.log.info("released locally %s", name)
    while time.time() < deadline:
        if self._lock_released(name, lock_id):
            released = True
            break
        time.sleep(0.5)
    if not released:
        thr.log.warning("timeout waiting for lock %s %s release on peers",
                        name, lock_id)
def lock_release(self, name, lock_id, silent=False, thr=None):
    with shared.LOCKS_LOCK:
        if not lock_id or shared.LOCKS.get(name, {}).get("id") != lock_id:
            return
        del shared.LOCKS[name]
    shared.wake_monitor(reason="unlock", immediate=True)
    if not silent:
        thr.log.info("released %s", name)
def _lock_acquire(self, nodename, name):
    with shared.LOCKS_LOCK:
        if name in shared.LOCKS:
            return
        lock_id = str(uuid.uuid4())
        shared.LOCKS[name] = {
            "requested": time.time(),
            "requester": nodename,
            "id": lock_id,
        }
    shared.wake_monitor(reason="lock", immediate=True)
    return lock_id
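# lock_release() above polls self._lock_released() to decide when the peers
# no longer advertise the lock. That helper is not part of this excerpt;
# below is a minimal sketch of what it could look like, assuming peer lock
# tables are exposed under shared.CLUSTER_DATA[<node>]["locks"] (an assumed
# layout, not confirmed by this excerpt).
def _lock_released(self, name, lock_id):
    # hypothetical sketch: the lock is gone once no peer dataset still
    # carries a lock entry with this name and id
    with shared.CLUSTER_DATA_LOCK:
        for data in shared.CLUSTER_DATA.values():
            lock = data.get("locks", {}).get(name)
            if lock and lock.get("id") == lock_id:
                return False
    return True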
def action(self, nodename, thr=None, **kwargs):
    """
    Shut down the daemon: stop the scheduler, ask all local svc and vol
    instances to shut down, wait for them, then stop the remaining
    threads. Care with locks.
    """
    thr.log_request("shutdown daemon", nodename, **kwargs)
    with shared.THREADS_LOCK:
        shared.THREADS["scheduler"].stop()
        mon = shared.THREADS["monitor"]
    if thr.stopped() or shared.NMON_DATA.status == "shutting":
        thr.log.info("already shutting")
        # wait for service shutdown to finish before releasing the dup client
        while True:
            if mon._shutdown:
                break
            time.sleep(0.3)
        return {"status": 0}
    try:
        thr.set_nmon("shutting")
        mon.kill_procs()
        for path in shared.SMON_DATA:
            _, _, kind = split_path(path)
            if kind not in ("svc", "vol"):
                continue
            thr.set_smon(path, local_expect="shutdown")
        self.wait_shutdown()
        # send a last status to peers so they can take over asap
        mon.update_hb_data()
        mon._shutdown = True
        shared.wake_monitor("services shutdown done")
    except Exception as exc:
        thr.log.exception(exc)
    thr.log.info("services are now shutdown")
    while True:
        with shared.THREADS_LOCK:
            if not shared.THREADS["monitor"].is_alive():
                break
        time.sleep(0.3)
    shared.DAEMON_STOP.set()
    return {"status": 0}
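# The shutdown handler above delegates to self.wait_shutdown(), which is
# not shown here. A plausible sketch, assuming it only needs to poll
# shared.SMON_DATA until no instance still has local_expect set to
# "shutdown" (the body is an assumption, not the actual implementation):
def wait_shutdown(self):
    while True:
        remaining = [path for path, smon in shared.SMON_DATA.items()
                     if smon.local_expect == "shutdown"]
        if not remaining:
            break
        time.sleep(1)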
def _read_config(self):
    """
    Reload the node configuration file and notify the threads to do the
    same, if the file's mtime has changed since the last load.
    """
    mtime = self.get_config_mtime()
    if mtime is None:
        return
    if self.last_config_mtime is not None and \
       self.last_config_mtime >= mtime:
        return
    try:
        with shared.NODE_LOCK:
            if shared.NODE:
                shared.NODE.close()
            shared.NODE = Node()
            shared.NODE.set_rlimit()
            shared.NODE.network_setup()
        unset_lazy(self, "config_hbs")
        if self.last_config_mtime:
            self.log.info("node config reloaded (changed)")
        else:
            self.log.info("node config loaded")
        self.last_config_mtime = mtime

        # signal the node config change to threads
        for thr in self.threads.values():
            if thr.stopped():
                thr.unstop()
            else:
                thr.notify_config_change()
        shared.wake_monitor(reason="config change", immediate=True)

        # signal the caller the config has changed
        return True
    except Exception as exc:
        self.log.warning("failed to load config: %s", str(exc))
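# _read_config() relies on self.get_config_mtime(), also absent from this
# excerpt. A minimal sketch, assuming the node configuration path is
# published as Env.paths.nodeconf (a hypothetical attribute name):
def get_config_mtime(self):
    try:
        # None signals "no config file", so _read_config() returns early
        return os.path.getmtime(Env.paths.nodeconf)
    except OSError:
        return None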
def stop_threads(self):
    """
    Send a stop notification to all threads, and wait for them to
    complete their shutdown. Stop dns last, so the service is available
    as long as possible.
    """
    self.log.info("signal stop to all threads")
    for thr_id, thr in self.threads.items():
        if thr_id == "dns":
            continue
        thr.stop()
    shared.wake_collector()
    shared.wake_scheduler()
    shared.wake_monitor(reason="stop threads", immediate=True)
    shared.wake_heartbeat_tx()
    for thr_id, thr in self.threads.items():
        if thr_id == "dns":
            continue
        self.log.info("waiting for %s to stop", thr_id)
        thr.join()
    if "dns" in self.threads:
        self.threads["dns"].stop()
        self.log.info("waiting for dns to stop")
        self.threads["dns"].join()
def action(self, nodename, thr=None, **kwargs):
    options = self.parse_options(kwargs)
    if not options.thr_id:
        thr.log_request("stop daemon", nodename, **kwargs)
        if options.get("upgrade"):
            thr.set_nmon(status="upgrade")
            thr.log.info("announce upgrade state")
        else:
            thr.set_nmon(status="maintenance")
            thr.log.info("announce maintenance state")
        time.sleep(5)
        shared.DAEMON_STOP.set()
        return {"status": 0}
    elif options.thr_id == "tx":
        thr_ids = [thr_id for thr_id in shared.THREADS.keys()
                   if thr_id.endswith("tx")]
    else:
        thr_ids = [options.thr_id]
    for thr_id in thr_ids:
        with shared.THREADS_LOCK:
            has_thr = thr_id in shared.THREADS
        if not has_thr:
            thr.log_request("stop thread requested on non-existing thread",
                            nodename, **kwargs)
            return {"error": "thread does not exist", "status": 1}
        thr.log_request("stop thread %s" % thr_id, nodename, **kwargs)
        with shared.THREADS_LOCK:
            shared.THREADS[thr_id].stop()
        if thr_id == "scheduler":
            shared.wake_scheduler()
        elif thr_id == "monitor":
            shared.wake_monitor("shutdown")
        elif thr_id.endswith("tx"):
            shared.wake_heartbeat_tx()
        if options.get("wait", False):
            with shared.THREADS_LOCK:
                shared.THREADS[thr_id].join()
    return {"status": 0}
def _store_rx_data(self, data, nodename):
    current_gen = shared.REMOTE_GEN.get(nodename, 0)
    our_gen_on_peer = data.get("gen", {}).get(Env.nodename, 0)
    kind = data.get("kind", "full")
    change = False
    if kind == "patch":
        if current_gen == 0:
            # waiting for a full: ignore patches
            return
        if nodename not in shared.CLUSTER_DATA:
            # happens during init. ignore the patch, and ask for a full
            shared.REMOTE_GEN[nodename] = 0
            shared.LOCAL_GEN[nodename] = our_gen_on_peer
            return
        deltas = data.get("deltas", [])
        gens = sorted([int(gen) for gen in deltas])
        gens = [gen for gen in gens if gen > current_gen]
        if len(gens) == 0:
            # self.log.info("no more recent gen in received deltas")
            if our_gen_on_peer > shared.LOCAL_GEN[nodename]:
                shared.LOCAL_GEN[nodename] = our_gen_on_peer
                shared.CLUSTER_DATA[nodename]["gen"][Env.nodename] = our_gen_on_peer
            return
        with shared.CLUSTER_DATA_LOCK:
            for gen in gens:
                # self.log.debug("merge node %s gen %d (%d diffs)", nodename, gen, len(deltas[str(gen)]))
                if gen - 1 != current_gen:
                    self.log.warning(
                        "unsynchronized node %s dataset. local gen %d, "
                        "received %d. ask for a full.",
                        nodename, current_gen, gen)
                    shared.REMOTE_GEN[nodename] = 0
                    shared.LOCAL_GEN[nodename] = our_gen_on_peer
                    shared.CLUSTER_DATA[nodename]["gen"] = {
                        nodename: gen,
                        Env.nodename: our_gen_on_peer,
                    }
                    break
                try:
                    json_delta.patch(shared.CLUSTER_DATA[nodename], deltas[str(gen)])
                    current_gen = gen
                    shared.REMOTE_GEN[nodename] = gen
                    shared.LOCAL_GEN[nodename] = our_gen_on_peer
                    shared.CLUSTER_DATA[nodename]["gen"] = {
                        nodename: gen,
                        Env.nodename: our_gen_on_peer,
                    }
                    self.log.debug(
                        "patch node %s dataset to gen %d, peer has gen %d of our dataset",
                        nodename, shared.REMOTE_GEN[nodename], shared.LOCAL_GEN[nodename])
                    if self.patch_has_nodes_info_change(deltas[str(gen)]):
                        self.on_nodes_info_change()
                    change = True
                except Exception as exc:
                    self.log.warning(
                        "failed to apply node %s dataset gen %d patch: %s. "
                        "ask for a full: %s",
                        nodename, gen, deltas[str(gen)], exc)
                    shared.REMOTE_GEN[nodename] = 0
                    shared.LOCAL_GEN[nodename] = our_gen_on_peer
                    shared.CLUSTER_DATA[nodename]["gen"] = {
                        nodename: gen,
                        Env.nodename: our_gen_on_peer,
                    }
                    return
    elif kind == "ping":
        with shared.CLUSTER_DATA_LOCK:
            shared.REMOTE_GEN[nodename] = 0
            shared.LOCAL_GEN[nodename] = our_gen_on_peer
            if nodename not in shared.CLUSTER_DATA:
                shared.CLUSTER_DATA[nodename] = {}
            shared.CLUSTER_DATA[nodename]["gen"] = {
                nodename: 0,
                Env.nodename: our_gen_on_peer,
            }
            shared.CLUSTER_DATA[nodename]["monitor"] = data["monitor"]
            self.log.debug(
                "reset node %s dataset gen, peer has gen %d of our dataset",
                nodename, shared.LOCAL_GEN[nodename])
            change = True
    else:
        data_gen = data.get("gen", {}).get(nodename)
        if data_gen is None:
            self.log.debug("no 'gen' in full dataset from %s: drop", nodename)
            return
        last_gen = shared.REMOTE_GEN.get(nodename)
        if last_gen is not None and last_gen >= data_gen:
            self.log.debug(
                "already installed or beyond %s gen %d dataset: drop",
                nodename, data_gen)
            return
        node_status = data.get("monitor", {}).get("status")
        if node_status in ("init", "maintenance", "upgrade") and \
           nodename in shared.CLUSTER_DATA:
            # preserve the last known status of instances the peer has
            # not re-evaluated yet
            for path, idata in shared.CLUSTER_DATA[nodename].get(
                    "services", {}).get("status", {}).items():
                if path in data["services"]["status"]:
                    continue
                idata["preserved"] = True
                data["services"]["status"][path] = idata
        with shared.CLUSTER_DATA_LOCK:
            shared.CLUSTER_DATA[nodename] = data
            new_gen = data.get("gen", {}).get(nodename, 0)
            shared.LOCAL_GEN[nodename] = our_gen_on_peer
            self.on_nodes_info_change()
            shared.REMOTE_GEN[nodename] = new_gen
            shared.CLUSTER_DATA[nodename]["gen"] = {
                nodename: new_gen,
                Env.nodename: our_gen_on_peer,
            }
            self.log.debug(
                "install node %s dataset gen %d, peer has gen %d of our dataset",
                nodename, shared.REMOTE_GEN[nodename], shared.LOCAL_GEN[nodename])
            change = True
    if change:
        shared.wake_monitor(
            "node %s %s dataset gen %d received through %s" %
            (nodename, kind, shared.REMOTE_GEN[nodename], self.name))