Beispiel #1
0
    def wait_dns_records(self):
        """
        Wait for the resource ip address to be known cluster-wide.

        Returns immediately when self.wait_dns is unset or zero. Otherwise
        the resource status is refreshed, then the method waits, within a
        total budget of self.wait_dns, for:

        1. the ip address to show up in the local node dataset
        2. a daemon "sync" action to report a zero status

        Raises ex.Error if the first wait is interrupted by a
        KeyboardInterrupt, or if no sync succeeded before the deadline.
        """
        if not self.wait_dns:
            return

        # refresh the ipaddr advertized in status.json
        self.status_info()
        self.write_status_last()
        self.svc.print_status_data_eval()

        budget = self.wait_dns
        deadline = self._current_time() + budget
        self.log.info("wait address propagation to peers (wait_dns=%s)", print_duration(budget))
        path_fmt = ".monitor.nodes.'%s'.services.status.'%s'.resources.'%s'.info.ipaddr~[0-9]"
        path = path_fmt % (Env.nodename, self.svc.path, self.rid)
        try:
            self.svc.node._wait(path=path, duration=budget)
        except KeyboardInterrupt:
            raise ex.Error("dns resolution not ready after %s (ip not in local dataset)" % print_duration(self.wait_dns))

        # whatever is left of the budget is spent on the cluster sync
        remaining = deadline - self._current_time()
        self.log.info("wait cluster sync (time left is %s)", print_duration(remaining))
        while remaining > 0:
            request = {
                "action": "sync",
                "options": {
                    "timeout": remaining
                },
            }
            reply = self.svc.node.daemon_get(request, timeout=remaining+10)
            if reply["status"] == 0:
                return
            wait_dns_records_delay_func(0.3)  # avoid fast-looping the listener
            remaining = deadline - self._current_time()
        raise ex.Error("dns resolution not ready after %s (cluster sync timeout)" % print_duration(self.wait_dns))
Beispiel #2
0
 def wait_dns_records(self):
     """
     Wait for the resource ip address to be known cluster-wide.

     Returns immediately when self.wait_dns is unset or zero. Otherwise
     waits, within a total budget of self.wait_dns, for the ip address
     to show up in the local node dataset, then for a daemon "sync"
     action to report a zero status.

     Raises ex.Error if the first wait is interrupted by a
     KeyboardInterrupt, or if no sync succeeded before the deadline.
     """
     if not self.wait_dns:
         return
     budget = self.wait_dns
     deadline = self._current_time() + budget
     self.svc.print_status_data_eval()
     self.log.info("wait address propagation to peers (wait_dns=%s)", print_duration(budget))
     path_fmt = ".monitor.nodes.'%s'.services.status.'%s'.resources.'%s'.info.ipaddr~[0-9]"
     path = path_fmt % (Env.nodename, self.svc.path, self.rid)
     try:
         self.svc.node._wait(path=path, duration=budget)
     except KeyboardInterrupt:
         raise ex.Error("dns resolution not ready after %s (ip not in local dataset)" % print_duration(self.wait_dns))

     # whatever is left of the budget is spent on the cluster sync
     remaining = deadline - self._current_time()
     self.log.info("wait cluster sync (time left is %s)", print_duration(remaining))
     while remaining > 0:
         reply = self.svc.node.daemon_get({"action": "sync"}, timeout=remaining)
         if reply["status"] == 0:
             return
         remaining = deadline - self._current_time()
     raise ex.Error("dns resolution not ready after %s (cluster sync timeout)" % print_duration(self.wait_dns))
Beispiel #3
0
 def queue_action(self, action, delay=0, path=None, rid=None, now=None):
     """
     Register <action> on (<path>, <rid>) in the delayed queue.

     Nothing is queued if the same (action, path, rid) signature is
     already in self.running. If the signature is already in
     self.delayed, the call is handed over to promote_queued_action().
     Otherwise a new entry expiring at <now> + <delay> is stored in
     self.delayed.

     A <delay> of None is treated as 0.
     A <now> of None (the default) means self.now.
     """
     sig = (action, path, rid)
     if delay is None:
         delay = 0
     if sig in self.running:
         self.log.debug("skip already running action '%s'", sig)
         return
     if sig in self.delayed:
         self.promote_queued_action(sig, delay, now)
         return
     if now is None:
         # the advertized default must not break the expiration
         # computation below (None + delay raises TypeError)
         now = self.now
     exp = now + delay
     self.delayed[sig] = {
         "queued": self.now,
         "expire": exp,
         "delay": delay,
     }
     if not delay:
         self.log.debug("queued action '%s' for run in %s", sig, print_duration(exp-self.now))
     else:
         self.log.debug("queued action '%s' for run in %s + %s delay", sig, print_duration(exp-self.now), print_duration(delay))
Beispiel #4
0
 def dequeue_actions(self):
     """
     Execute every merged task reported by get_todo(), then hand the
     signatures of the executed tasks to delete_queued() so they are
     purged from the delayed hash.
     """
     done = []
     for todo in self.get_todo():
         action, path, rids = todo["action"], todo["path"], todo["rids"]
         cmd = self.format_cmd(action, path, rids)
         log_cmd = self.format_log_cmd(action, path, rids)
         label = " ".join(log_cmd)
         age = print_duration(self.now - todo["queued"])
         self.log.info("run '%s' queued %s ago", label, age)
         sigs = todo["sigs"]
         self.exec_action(sigs, cmd)
         done.extend(sigs)
     self.delete_queued(done)
Beispiel #5
0
def fmt_cpu_time(get, stats_data):
    """
    Format the sum of the values extracted by <get> from each value of
    the <stats_data> mapping.

    Entries for which <get> (or the addition) raises KeyError or
    TypeError are ignored. Returns "" when <stats_data> is None or when
    print_duration() fails on the total.
    """
    if stats_data is None:
        return ""
    total = 0
    for entry in stats_data.values():
        try:
            total += get(entry)
        except (KeyError, TypeError):
            # entry without the expected data: leave it out of the sum
            continue
    try:
        formatted = print_duration(total)
    except Exception:
        formatted = ""
    return formatted
Beispiel #6
0
 def _status(self, verbose=False):
     """
     Evaluate the cloneset sync status.

     Returns:
     * core.status.DOWN if self.last is None
     * core.status.WARN if only a part of the pairs is active, or if
       self.last is older than self.sync_max_delay seconds
     * core.status.UP otherwise

     <verbose> is not used.
     """
     if self.last is None:
         return core.status.DOWN

     n_active = len(self.active_pairs)
     n_pairs = len(self.pairs)
     # all pairs active or none active are the only consistent states
     if n_active != n_pairs and n_active != 0:
         self.status_log("cloneset has %d/%d active devs" % (n_active, n_pairs))
         return core.status.WARN

     oldest_allowed = datetime.datetime.now() - datetime.timedelta(seconds=self.sync_max_delay)
     if self.last < oldest_allowed:
         self.status_log("Last sync on %s older than %s" % (self.last, print_duration(self.sync_max_delay)))
         return core.status.WARN

     self.status_log("Last sync on %s" % self.last, "info")
     return core.status.UP
Beispiel #7
0
def fmt_svc_uptime(key, stats_data):
    """
    Format the longest uptime of the <key> service found in the values
    of the <stats_data> mapping.

    The uptime is the current time minus the "created" value found under
    ["services"][<key>] of each entry. Entries raising TypeError or
    KeyError are ignored. Returns "" when <stats_data> is None or when
    print_duration() fails.
    """
    if stats_data is None:
        return ""
    now = time.time()
    longest = 0
    for node_data in stats_data.values():
        try:
            longest = max(longest, now - node_data["services"][key]["created"])
        except (TypeError, KeyError):
            # entry without the expected data: ignore it
            continue
    try:
        formatted = print_duration(longest)
    except Exception:
        formatted = ""
    return formatted
Beispiel #8
0
 def can_sync(self, target=None):
     """
     Tell whether a sync can be done, from the date stored in the local
     state.

     Returns:
     * True if the local state can not be read (IOError), after logging
       an error
     * False if the local state can not be parsed for any other reason,
       after printing the traceback
     * False if self.skip_sync() is true for the parsed date
     * True otherwise

     <target> is not used.
     """
     try:
         ls = self.get_local_state()
         ts = datetime.datetime.strptime(ls['date'], "%Y-%m-%d %H:%M:%S.%f")
     except IOError:
         self.log.error("btrfs state file not found")
         return True
     except Exception:
         # Not a bare "except:": KeyboardInterrupt and SystemExit must
         # reach the caller instead of being reported as "can't sync".
         import traceback
         traceback.print_exc()
         return False
     if self.skip_sync(ts):
         self.status_log("Last sync on %s older than %s" %
                         (ts, print_duration(self.sync_max_delay)))
         return False
     return True
Beispiel #9
0
 def sync_status(self, verbose=False):
     """
     Evaluate the sync status from the date stored in the local state.

     Returns:
     * core.status.WARN if the local state can not be read (IOError),
       after a status log entry
     * core.status.WARN if the local state can not be parsed for any
       other reason, after printing the traceback
     * core.status.WARN if the last sync is older than
       self.sync_max_delay seconds
     * core.status.UP otherwise

     <verbose> is not used.
     """
     try:
         ls = self.get_local_state()
         now = datetime.datetime.now()
         last = datetime.datetime.strptime(ls['date'],
                                           "%Y-%m-%d %H:%M:%S.%f")
         delay = datetime.timedelta(seconds=self.sync_max_delay)
     except IOError:
         self.status_log("zfs state file not found")
         return core.status.WARN
     except Exception:
         # Not a bare "except:": KeyboardInterrupt and SystemExit must
         # reach the caller instead of being turned into a warning.
         import traceback
         traceback.print_exc()
         return core.status.WARN
     if last < now - delay:
         self.status_log("Last sync on %s older than %s" %
                         (last, print_duration(self.sync_max_delay)))
         return core.status.WARN
     return core.status.UP
Beispiel #10
0
 def janitor_certificates(self):
     """
     Renew the certificates of the "sec" and "usr" objects signed by the
     cluster ca when they expire in less than an hour.

     The job is skipped if it ran less than JANITOR_CERTS_INTERVAL ago,
     or if this node is not the one returned by first_available_node().

     The certificate expiration of each object is cached in
     self.certificates, and refreshed when the "updated" time of the
     object config reported in the cluster data is more recent than the
     cached one.
     """
     if self.now < self.last_janitor_certs + JANITOR_CERTS_INTERVAL:
         return
     if self.first_available_node() != Env.nodename:
         return
     self.last_janitor_certs = time.time()

     # iterate a snapshot of the keys, and tolerate the keys that are
     # no longer in shared.SERVICES when their turn comes
     for path in list(shared.SERVICES):
         try:
             obj = shared.SERVICES[path]
         except KeyError:
             continue
         if obj.kind not in ("sec", "usr"):
             continue
         try:
             ca = obj.oget("DEFAULT", "ca")
         except Exception:
             continue
         if ca != self.cluster_ca:
             continue

         node_data = shared.CLUSTER_DATA.get(Env.nodename, {})
         config_data = node_data.get("services", {}).get("config", {})
         cf_mtime = config_data.get(obj.path, {}).get("updated")
         if cf_mtime is None:
             continue

         stale = obj.path not in self.certificates or self.certificates[obj.path]["mtime"] < cf_mtime
         if stale:
             try:
                 expire = obj.get_cert_expire()
             except ex.Error:
                 # usr in creation
                 expire = None
             self.certificates[obj.path] = {"mtime": cf_mtime, "expire": expire}

         expire = self.certificates[obj.path]["expire"]
         if not expire:
             continue
         expire_delay = expire - self.now
         if expire_delay < 3600:
             self.log.info("renew %s certificate, expiring in %s", obj.path, print_duration(expire_delay))
             obj.gen_cert()
Beispiel #11
0
    def docker(self, action):
        """
        Wrap docker commands to honor <action>.

        Supported actions are "start", "stop" and "kill". Any other value
        is logged as an error and makes the method return 1 without
        running a command.

        On "start", a new container is created with "docker run" if
        self.container_id is still None after self.is_up_clear_cache()
        was called. Otherwise the existing container is started with
        "docker start".

        Returns None on success.

        Raises ex.Error if the docker executable is not known, if the
        image id lookup raises ValueError, or if the docker command
        returns a non-zero code (a KeyboardInterrupt during the command
        is handled as a non-zero return code).
        """
        if self.lib.docker_cmd is None:
            raise ex.Error("docker executable not found")
        sec_env = {}
        cfg_env = {}
        # work on a copy: cmd is extended in place below
        cmd = self.lib.docker_cmd + []
        if action == "start":
            if not self.detach and self.start_timeout is not None:
                # foreground start with a timeout: have a SIGALRM
                # delivered after start_timeout.
                # NOTE(review): alarm_handler is defined elsewhere;
                # presumably it raises the KeyboardInterrupt handled
                # around the vcall() below -- confirm.
                signal.signal(signal.SIGALRM, alarm_handler)
                signal.alarm(self.start_timeout)
            if self.rm:
                self.container_rm()
            if self.container_id is None:
                self.is_up_clear_cache()
            # tested again, after the is_up_clear_cache() call above
            if self.container_id is None:
                try:
                    image_id = self.lib.get_image_id(self.image)
                except ValueError as exc:
                    raise ex.Error(str(exc))
                if image_id is None:
                    # NOTE(review): presumably the image is not available
                    # locally, hence the login before the run -- confirm.
                    self.lib.docker_login(self.image)
                sec_env = self.kind_environment_env("sec",
                                                    self.secrets_environment)
                cfg_env = self.kind_environment_env("cfg",
                                                    self.configs_environment)
                cmd += ["run"]
                cmd += self._add_run_args()
                # Only the iterated items of sec_env and cfg_env are added
                # to the command line, as "-e <var>". Both are also merged
                # in the environment passed to vcall() below.
                # NOTE(review): assumes both are mappings of variable
                # name to value -- confirm.
                for var in sec_env:
                    cmd += ["-e", var]
                for var in cfg_env:
                    cmd += ["-e", var]
                cmd += [self.image]
                if self.run_command:
                    cmd += self.run_command
            else:
                cmd += ["start", self.container_id]
        elif action == "stop":
            # NOTE(review): self.container_id is not tested against None
            # for stop and kill.
            cmd += ["stop", self.container_id]
        elif action == "kill":
            cmd += ["kill", self.container_id]
        else:
            self.log.error("unsupported docker action: %s", action)
            return 1

        # environment of the docker command: the current process
        # environment, overridden by sec_env, then by cfg_env
        env = {}
        env.update(os.environ)
        env.update(sec_env)
        env.update(cfg_env)
        try:
            ret = self.vcall(cmd, warn_to_info=True, env=env)[0]
        except KeyboardInterrupt:
            # NOTE(review): reported as a start_timeout expiration
            # whatever the action is.
            self.log.error("%s timeout exceeded",
                           print_duration(self.start_timeout))
            if action == "start":
                # kill the container by name
                cmd = self.lib.docker_cmd + ["kill", self.container_name]
                self.vcall(cmd, warn_to_info=True, env=env)
            ret = 1
        if not self.detach:
            # cancel the pending alarm, if any
            signal.alarm(0)
        if ret != 0:
            raise ex.Error

        if action == "start":
            self.is_up_clear_cache()
        elif action in ("stop", "kill"):
            if self.rm:
                self.container_rm()
            self.is_up_clear_cache()
            self.lib.docker_stop()
Beispiel #12
0
    def _status(self, verbose=False):
        """
        Evaluate the flashcopy sync status.

        Returns core.status.WARN if:
        * lsflash() or get_last() raises ex.Error
        * the recording state of a relation does not match self.recording
        * a relation state is not "Valid"
        * a relation has "Persistent" set to "Disabled"
        * a pair of self.pairs has no relation listed
        * self.last is None, or older than self.sync_max_delay seconds

        Returns core.status.UP otherwise.

        <verbose> is not used.
        """
        try:
            data = self.lsflash()
            self.get_last(data)
        except ex.Error as exc:
            self.status_log(str(exc))
            return core.status.WARN

        degraded = False
        rec_off = []
        rec_on = []
        invalid = []
        not_persistent = []

        # sort the relation ids by anomaly
        for entry in data:
            recording = entry['Recording']
            if recording == "Disabled":
                rec_off.append(entry['ID'])
            elif recording == "Enabled":
                rec_on.append(entry['ID'])
            if entry['State'] != "Valid":
                invalid.append(entry['ID'])
            if entry['Persistent'] == "Disabled":
                not_persistent.append(entry['ID'])

        if self.recording and rec_off:
            self.status_log("Recording disabled on %s" % ','.join(rec_off))
            degraded = True
        elif not self.recording and rec_on:
            self.status_log("Recording enabled on %s" % ','.join(rec_on))
            degraded = True
        if invalid:
            self.status_log("State not valid on %s" % ','.join(invalid))
            degraded = True
        if not_persistent:
            self.status_log("Persistent disabled on %s" % ','.join(not_persistent))
            degraded = True

        # expected pairs without a listed relation
        listed = [entry['ID'] for entry in data if 'ID' in entry]
        missing = sorted(set(self.pairs) - set(listed))
        if missing:
            self.status_log("Missing flashcopy on %s" % ','.join(missing))
            degraded = True

        if self.last is None:
            return core.status.WARN
        oldest_allowed = datetime.datetime.now() - datetime.timedelta(seconds=self.sync_max_delay)
        if self.last < oldest_allowed:
            self.status_log("Last sync on %s older than %s" % (self.last, print_duration(self.sync_max_delay)))
            return core.status.WARN
        if degraded:
            return core.status.WARN
        self.status_log("Last sync on %s" % self.last)
        return core.status.UP