def run(self):
    """Check that metadata-exporter is running and publishing on ZMQ.

    Returns True when the ``metadata`` interface does not exist (there is
    nothing to connect to) or when the subscription becomes readable
    within the poll timeout.  Returns False when the process or service is
    not reported as running, or when the poll times out.  Returns None
    when the poll reports an event on the socket other than POLLIN.
    """
    # if the metadata interface does not exist, not connecting is ok
    metadata = shell("ifconfig metadata 2>&1 || true")
    if "Device not found" in metadata:
        return True

    ps = shell("ps ax|grep exporter")
    if "metadata-exporter" not in ps:
        return False

    # is it a service?
    status = shell("systemctl status metadata-exporter")
    if "running" not in status:
        return False

    print("Subscribing to ZMQ socket on tcp://172.17.0.1:5556")
    context = zmq.Context()
    sub = context.socket(zmq.SUB)
    try:
        sub.connect("tcp://172.17.0.1:5556")
        # empty prefix subscribes to every message; bytes literal is the
        # same object type as '' on Python 2 and valid on Python 3
        sub.setsockopt(zmq.SUBSCRIBE, b'')

        poller = zmq.Poller()
        poller.register(sub, zmq.POLLIN)
        socks = dict(poller.poll(60000))  # poll timeout in milliseconds
        if socks:
            if socks.get(sub) == zmq.POLLIN:
                return True
        else:
            print("Timeout.")
            return False
    finally:
        # Release the socket and context on every exit path; LINGER=0
        # keeps close()/term() from blocking on undelivered data.
        sub.setsockopt(zmq.LINGER, 0)
        sub.close()
        context.term()
def run(self):
    """Reinstall every package that dpkg lists with status 'ih'.

    Each package name is passed to ``apt-get install --reinstall`` with a
    60 second timeout.  Nothing is run when the listing is empty.
    """
    # dpkg -l 'ih' means: Desired=Installed, Status=Half-inst
    listing = shell("dpkg -l|grep -E ^ih|awk '{print $2}'")
    # split() without arguments drops blank entries, so an empty listing
    # no longer results in one apt-get call with an empty package name.
    for pkg in listing.split():
        shell("apt-get install -y --allow-unauthenticated --reinstall %s" %
              (pkg, ),
              timeout=60)
def run(self):
    """Check that the autotunnel watcher process and service are up.

    Returns True only if "watcher" appears in the filtered process list
    and "running" appears in the ``systemctl status autotunnel`` output.
    """
    processes = shell("ps ax|grep autotunnel")
    if "watcher" not in processes:
        return False

    service_state = shell("systemctl status autotunnel")
    return "running" in service_state
def run(self):
    """Check that the monroe/base docker image is available.

    The check is skipped (returns True) when ``docker --version`` does not
    mention "1.10"; otherwise it returns whether "monroe/base" shows up in
    the ``docker images`` output.
    """
    version = shell("docker --version")
    if "1.10" not in version:
        # docker is not installed in the expected version: nothing to test
        return True

    return "monroe/base" in shell("docker images")
def run(self):
    """Check that network-listener is running as a process and a service.

    Returns True only if "network-listener" appears in the filtered process
    list and "running" appears in the systemctl status output.
    """
    processes = shell("ps ax|grep listener")
    if "network-listener" not in processes:
        return False

    # the process exists; also require systemd to report the unit as running
    service_state = shell("systemctl status network-listener")
    return "running" in service_state
def run(self):
    """Check that dlb is running as a process and a service.

    Returns True only if "sbin/dlb" appears in the filtered process list
    and "running" appears in the ``systemctl status dlb`` output.
    """
    processes = shell("ps ax|grep dlb")
    if "sbin/dlb" not in processes:
        return False

    # the process exists; also require systemd to report the unit as running
    service_state = shell("systemctl status dlb")
    return "running" in service_state
def run(self):
    """Trigger maintenance when docker has not been seen for an hour.

    Reads the epoch timestamp stored in /tmp/last_seen_docker.  When the
    file is missing or does not contain an integer, the timestamp is reset
    to now and None is returned.  When the stored timestamp is more than
    3600 seconds old, the result of ``trigger_maintenance`` is returned.
    """
    raw = shell("cat /tmp/last_seen_docker")
    try:
        last_seen = None if "No such file" in raw else int(raw)
    except ValueError:
        # empty or corrupted file: handle it like a missing file instead
        # of letting the check raise
        last_seen = None

    if last_seen is None:
        # after reboot, the file does not exist.
        # In that case, we reset the timer and wait for the regular timeout.
        shell("date +%s > /tmp/last_seen_docker")
        return

    if (int(time.time()) - last_seen) > 3600:
        return trigger_maintenance(
            "docker service has not run for 60 minutes.")
def run(self):
    """Check whether the backend can be reached over ssh.

    Runs ``echo success`` on the backend host using the settings sourced
    from /etc/default/autotunnel.  Returns False when "success" is not in
    the output; otherwise records the current epoch time in
    /var/lib/biteback/autotunnel.last and returns True.
    """
    command = 'ssh -o StrictHostKeychecking=no -i $BACKEND_SSH_KEY -o ConnectTimeout=5 -o BatchMode=yes -o UserKnownHostsFile=/dev/null $BACKEND_SSH_USER@$BACKEND_SSH_SERVER echo success'
    reply = shell(command, source='/etc/default/autotunnel')
    if "success" not in reply:
        return False

    # remember when the backend was last reachable
    for bookkeeping in ('mkdir -p /var/lib/biteback/',
                        'date +%s > /var/lib/biteback/autotunnel.last'):
        shell(bookkeeping)
    return True
def run(self):
    """Trigger a reinstall after three days of uptime without a tunnel.

    Reads the epoch timestamp of the last successful tunnel from
    /var/lib/biteback/autotunnel.last.  Returns None (ignore) when that
    timestamp cannot be read.  Otherwise asks tuptime for the uptime since
    that timestamp and returns the result of ``trigger_reinstall`` when it
    exceeds 259200 seconds.
    """
    try:
        last = int(shell('cat /var/lib/biteback/autotunnel.last'))
    except Exception:
        # Still best-effort for any ordinary failure, but no longer
        # swallows KeyboardInterrupt/SystemExit like a bare except did.
        return  # Ignore

    uptime = int(shell("tuptime -s --csv --tsince %s | tail -n 1 | cut -d, -f2 | tr -d \\\"" % last))  # uptime since last tunnel
    if uptime > 259200:  # three days
        return trigger_reinstall()
def run(self):
    """Check whether the backend can be reached over ssh.

    Runs ``echo success`` on the backend host using the settings sourced
    from /etc/default/autotunnel.  Returns False when "success" is not in
    the output; otherwise records the current epoch time in
    /var/lib/biteback/autotunnel.last and returns True.
    """
    command = 'ssh -o StrictHostKeychecking=no -i $BACKEND_SSH_KEY -o ConnectTimeout=5 -o BatchMode=yes -o UserKnownHostsFile=/dev/null $BACKEND_SSH_USER@$BACKEND_SSH_SERVER echo success'
    reply = shell(command, source='/etc/default/autotunnel')
    if "success" not in reply:
        return False

    # remember when the backend was last reachable
    for bookkeeping in ('mkdir -p /var/lib/biteback/',
                        'date +%s > /var/lib/biteback/autotunnel.last'):
        shell(bookkeeping)
    return True
def run(self):
    """Check available disk space and the number of files in /tmp.

    Returns False as soon as the available space reported by df for /,
    /tmp or /var/log is below its threshold, or when /tmp holds more than
    1000 entries; True otherwise.
    """
    # (df command, minimum "avail" value) checked in this order
    space_requirements = (
        ("df / --output=avail|tail -n1", 500000),
        ("df /tmp --output=avail|tail -n1", 10000),
        ("df /var/log --output=avail|tail -n1", 10000),
    )
    for command, minimum in space_requirements:
        available = int(shell(command))
        if available < minimum:
            return False

    entries = int(shell("ls /tmp | wc -l"))
    return not entries > 1000
def run(self):
    """Trigger a reinstall after three days of uptime without a tunnel.

    Reads the epoch timestamp of the last successful tunnel from
    /var/lib/biteback/autotunnel.last.  Returns None (ignore) when that
    timestamp cannot be read.  Otherwise asks tuptime for the uptime since
    that timestamp and returns the result of ``trigger_reinstall`` when it
    exceeds 259200 seconds.
    """
    try:
        last = int(shell('cat /var/lib/biteback/autotunnel.last'))
    except Exception:
        # Still best-effort for any ordinary failure, but no longer
        # swallows KeyboardInterrupt/SystemExit like a bare except did.
        return  # Ignore

    uptime = int(
        shell(
            "tuptime -s --csv --tsince %s | tail -n 1 | cut -d, -f2 | tr -d \\\""
            % last))  # uptime since last tunnel
    if uptime > 259200:  # three days
        return trigger_reinstall()
def run(self):
    """Check that docker is running, if the docker-engine package is installed.

    Returns True when dpkg does not list docker-engine as "ii" (a missing
    docker is assumed to be intentional).  Otherwise requires "--bip" in
    the filtered process list and "active (running)" in the systemctl
    status output.
    """
    package_state = shell("dpkg -l|grep docker-engine")
    if "ii" not in package_state:
        return True

    processes = shell("ps ax|grep docker")
    if "--bip" not in processes:
        return False

    return "active (running)" in shell("systemctl status docker")
def run(self):
    """Check that the k10temp and sp5100_tco modules appear in lsmod."""
    loaded = shell("lsmod")
    required = ("k10temp", "sp5100_tco")
    return all(module in loaded for module in required)
def run(self):
    """Check that the wwan0 modem reports a mode.

    Queries http://localhost:88/modems and extracts the ``mode`` field of
    the entry whose ifname is "wwan0".  Returns False when that field is
    JSON null, True otherwise (including when there is no output at all).
    """
    mode = shell(
        "curl -s http://localhost:88/modems|jq '.[]|select(.ifname == \"wwan0\")|.mode'"
    )
    # Strip before comparing: a trailing newline from jq would otherwise
    # make the exact comparison against "null" fail.
    return mode.strip() != "null"
def run(self):
    """Check that marvind is running, if it is configured.

    Returns True (ignored) when /etc/marvind.conf or /etc/keys/marvind is
    missing.  Otherwise requires "bin/marvind" in the filtered process list
    and "running" in the ``systemctl status marvind`` output.
    """
    # without configuration or keys there is nothing to check
    for prerequisite in ("ls /etc/marvind.conf", "ls /etc/keys/marvind"):
        if "No such file" in shell(prerequisite):
            return True

    processes = shell("ps ax|grep marvind")
    if "bin/marvind" not in processes:
        return False

    service_state = shell("systemctl status marvind")
    return "running" in service_state
def run(self):
    """Check the docker service status for known thin-pool error messages.

    Returns False when any of the known messages appears in the output of
    ``systemctl status docker -l``, True otherwise.
    """
    service_log = shell("systemctl status docker -l")
    known_failures = (
        "Unable to take ownership of thin-pool",
        "Possibly using a different",  # ...thin pool than last invocation
        "Base Device UUID and Filesystem verification failed",
    )
    return not any(message in service_log for message in known_failures)
def run(self):
    """Check that metadata can be read inside the monroe network namespace.

    Reads the first 6 bytes produced by ``metadata`` inside the ``monroe``
    netns (60 second timeout).  Returns True when the output contains
    "Cannot" (the namespace is taken not to exist, so the check is
    ignored) or "MONROE"; False otherwise.
    """
    metadata = shell("ip netns exec monroe metadata | head -c 6", timeout=60)
    if "Cannot" in metadata:
        # call form of print: valid on Python 2 and 3
        print("Netns monroe does not exist. Ignoring")
        return True
    return "MONROE" in metadata
def run(self):
    """Tear down the docker thin pool and start docker again.

    Runs, in order and each with a 60 second timeout: stop docker, remove
    the docker- device-mapper entries, remove the thin-pool logical
    volume.  Finally starts docker again.
    """
    teardown_steps = (
        # should be stopped, just in case
        "systemctl stop docker",
        # remove any stale leases on the thinpool
        "dmsetup ls|grep docker-|cut -f1 -d'('|sort|xargs dmsetup remove",
        # remove the thinpool device
        "lvremove -f /dev/mapper/vg--monroe-tp--docker",
    )
    for step in teardown_steps:
        shell(step, timeout=60)

    # will fail, but remove the systemctl status message this test triggers on
    shell("systemctl start docker")
def run(self):
    """Check that the reported temperature does not exceed 100 degrees.

    Takes the second space-separated field of the munin ``temp`` plugin
    output.  Returns False only when that field parses as a number greater
    than 100.0.  A missing, empty or non-numeric field (e.g. a virtual
    environment without sensors) counts as "all good" and returns True.
    """
    fields = shell("/etc/munin/plugins/temp").split(" ")
    # Output without a space used to raise IndexError on fields[1].
    temp = fields[1] if len(fields) > 1 else ""
    if not temp[:1].isdigit():
        # only convert if temp starts with a digit
        return True
    try:
        value = float(temp)
    except ValueError:
        # starts with a digit but is not a number (e.g. "1:")
        return True
    return not value > 100.0
def run(self):
    """Enable the autotunnel service and restart it."""
    for command in ("systemctl enable autotunnel",
                    "systemctl restart autotunnel"):
        shell(command)
def run(self):
    """Reinstall munin-plugins-monroe and munin-node-c (60 second timeout)."""
    command = "apt-get install -y --force-yes --reinstall munin-plugins-monroe munin-node-c"
    shell(command, timeout=60)
def run(self):
    """Enable the sshd service and restart it."""
    for command in ("systemctl enable sshd", "systemctl restart sshd"):
        shell(command)
def run(self):
    """Reinstall the autotunnel package."""
    command = "apt-get install -y --force-yes --reinstall autotunnel"
    shell(command)
def run(self):
    """Check that /etc/cron.d/ansible-wrapper references the wrapper binary."""
    cron_entry = shell("cat /etc/cron.d/ansible-wrapper")
    return "/usr/bin/ansible-wrapper" in cron_entry
def run(self):
    """Reinstall the docker-engine package (160 second timeout)."""
    command = "apt-get install -y --force-yes --reinstall docker-engine"
    shell(command, timeout=160)
def run(self):
    """Reinstall the network-listener package (60 second timeout)."""
    command = "apt-get install -y --force-yes --reinstall network-listener"
    shell(command, timeout=60)
def run(self):
    """Enable the dlb service and restart it."""
    for command in ("systemctl enable dlb", "systemctl restart dlb"):
        shell(command)
def run(self):
    """Reinstall the python-marvin package (60 second timeout)."""
    command = "apt-get install -y --force-yes --reinstall python-marvin"
    shell(command, timeout=60)
def run(self):
    """Enable the marvind service and restart it."""
    for command in ("systemctl enable marvind", "systemctl restart marvind"):
        shell(command)
def run(self):
    """Check that dpkg lists no incompletely installed packages.

    Looks for ``dpkg -l`` lines that start with "i" but not with "ii".
    Returns True when there are none, False otherwise.
    """
    incomplete = shell("dpkg -l|grep -E ^i|grep -vE ^ii")
    return not incomplete
def run(self):
    """Reinstall every package that dpkg lists with status 'ih'.

    Each package name is passed to ``apt-get install --reinstall`` with a
    60 second timeout.  Nothing is run when the listing is empty.
    """
    # dpkg -l 'ih' means: Desired=Installed, Status=Half-inst
    listing = shell("dpkg -l|grep -E ^ih|awk '{print $2}'")
    # split() without arguments drops blank entries, so an empty listing
    # no longer results in one apt-get call with an empty package name.
    for pkg in listing.split():
        shell("apt-get install -y --allow-unauthenticated --reinstall %s" % (pkg,), timeout=60)
def run(self):
    """Check that "sbin/sshd" appears in the filtered process list."""
    processes = shell("ps ax|grep sshd")
    return "sbin/sshd" in processes
def run(self):
    """Check that at most one ``atq`` line contains "=".

    Returns False when more than one such line is counted, True otherwise.
    """
    marked_jobs = int(shell("atq | grep = | wc -l"))
    return not marked_jobs > 1
def run(self):
    """Run ``dpkg --configure -a``."""
    command = "dpkg --configure -a"
    shell(command)
def run(self):
    """Restart the network-listener service."""
    command = "systemctl restart network-listener"
    shell(command)
def run(self):
    """Enable the rsyslog service and restart it."""
    for command in ("systemctl enable rsyslog", "systemctl restart rsyslog"):
        shell(command)
def run(self):
    """Check that the wwan0 modem reports a mode.

    Queries http://localhost:88/modems and extracts the ``mode`` field of
    the entry whose ifname is "wwan0".  Returns False when that field is
    JSON null, True otherwise (including when there is no output at all).
    """
    mode = shell("curl -s http://localhost:88/modems|jq '.[]|select(.ifname == \"wwan0\")|.mode'")
    # Strip before comparing: a trailing newline from jq would otherwise
    # make the exact comparison against "null" fail.
    return mode.strip() != "null"
def run(self):
    """Rewrite /etc/cron.d/ansible-wrapper with a 20-minute schedule entry."""
    # NOTE(review): "&>" is a bash-style redirection -- confirm that the
    # shell cron uses for this entry interprets it as intended.
    command = "echo '*/20 * * * * root /usr/bin/ansible-wrapper &>/dev/null' > /etc/cron.d/ansible-wrapper"
    shell(command)
def run(self):
    """Clear "="-prefixed at job files and restart marvind.

    Removes /var/spool/cron/atjobs/=*, moves the marvind log to
    /var/log/marvind.log.atq and restarts the marvind service, in that
    order.
    """
    recovery_steps = (
        "rm /var/spool/cron/atjobs/=*",
        "mv /var/log/marvind.log /var/log/marvind.log.atq",
        "systemctl restart marvind",
    )
    for step in recovery_steps:
        shell(step)
def run(self):
    """Check that the munin node on localhost:4949 answers a ``list`` request.

    Returns True when "cpu" appears in the reply, False otherwise.
    """
    reply = shell('echo -e list\\\\nquit\\\\n|nc localhost 4949')
    return "cpu" in reply
def run(self):
    """Run ``circle restart``."""
    command = "circle restart"
    shell(command)
def run(self):
    """Enable the metadata-exporter service and restart it."""
    for command in ("systemctl enable metadata-exporter",
                    "systemctl restart metadata-exporter"):
        shell(command)
def run(self):
    """Enable the network-listener service and restart it."""
    for command in ("systemctl enable network-listener",
                    "systemctl restart network-listener"):
        shell(command)
def run(self):
    """Reinstall the metadata-exporter package (60 second timeout)."""
    command = "apt-get install -y --force-yes --reinstall metadata-exporter"
    shell(command, timeout=60)
def run(self):
    """Check that "sbin/rsyslog" appears in the filtered process list."""
    processes = shell("ps ax|grep rsyslog")
    return "sbin/rsyslog" in processes
def run(self):
    """Stop all running containers, then enable and restart docker."""
    restart_steps = (
        # -t 0: do not wait before killing the containers
        "docker stop -t 0 $(docker ps -q)",
        "systemctl enable docker",
        "systemctl restart docker",
    )
    for step in restart_steps:
        shell(step)
def run(self):
    """Load the k10temp and sp5100_tco modules and start the watchdog.

    Runs ``depmod`` first, then ``modprobe``, then starts the watchdog
    service.
    """
    for command in ("depmod",
                    "modprobe k10temp sp5100_tco",
                    "systemctl start watchdog"):
        shell(command)
def run(self):
    """Reinstall the dlb package (60 second timeout)."""
    command = "apt-get install -y --force-yes --reinstall dlb"
    shell(command, timeout=60)