def test_append(self):
    self.q.put(NewCgroupMessage(Cgroup("/some/foo")))
    self.q.put(StaleCgroupMessage(Cgroup("/some/foo")))
    self.q.put(ExitMessage())
    self.engine.run()
    self.assertHasLogged(
        "foo", ["container has started", "container has exited"])
def test_ps_table(self):
    cg_path = create_random_cg(self.parent_cg_path)

    multithread_sleep = "\n".join([
        'import time, threading',
        't = threading.Thread(target=time.sleep, args=(10,))',
        't.start()',
        't.join()'
    ])

    subprocess.Popen(["sudo", "cgexec", "-g",
                      descriptor_from_cg_path(cg_path),
                      "sh", "-c", "sleep 10"])
    subprocess.Popen(["sudo", "cgexec", "-g",
                      descriptor_from_cg_path(cg_path),
                      "python", "-c", multithread_sleep])

    time.sleep(2)  # Sleep for a little bit to let them spawn

    cg = Cgroup(cg_path)
    table = cg.ps_table()

    # We should see 3 processes (but there are 4 threads here)
    self.assertEqual(3, len(table))

    by_name = {proc["name"]: proc for proc in table}
    self.assertEqual(["python", "sh", "sleep"], sorted(by_name.keys()))

    for name in ["sh", "sleep"]:
        proc = by_name[name]
        self.assertIsInstance(proc["pid"], int)
        self.assertIsInstance(proc["memory_info"].vms, int)
        self.assertIsInstance(proc["memory_info"].rss, int)
        self.assertIsInstance(proc["cmdline"], list)
        self.assertIn(proc["status"], PROC_STATUSES_RAW.keys())
def sync(self):
    logger.debug("syncing cgroups")

    # Sync all monitors with disk, and remove stale ones. It's important to
    # actually *wakeup* monitors here, so as to ensure we don't race with
    # Docker when it creates a cgroup (which could result in us not seeing
    # the memory limit and therefore not disabling the OOM killer).
    for cg in list(self._path_hash.values()):
        try:
            cg.wakeup(self.job_queue, raise_for_stale=True)
        except EnvironmentError:
            self.remove(cg)

    for entry in os.listdir(self.root_cg_path):
        path = os.path.join(self.root_cg_path, entry)

        # Is this a CG or just a regular file?
        if not os.path.isdir(path):
            continue

        # We're already tracking this CG. It *might* have changed between
        # our check and now, but in that case we'll catch it at the next
        # sync.
        if path in self._path_hash:
            continue

        # This is a new CG. Register it and wake it up immediately after,
        # in case there already is some handling to do (typically:
        # disabling the OOM killer). To avoid race conditions, we do this
        # after registration to ensure we can deregister immediately if the
        # cgroup just exited.
        cg = Cgroup(path)
        self.register(cg)
        cg.wakeup(self.job_queue)
def test_reclose(self):
    cg_path = create_random_cg(self.parent_cg_path)
    cg = Cgroup(cg_path)
    cg.open()
    cg.close()
    self.assertRaises(AssertionError, cg.close)
def test_trigger_restart(self):
    cg_path = create_random_cg(self.parent_cg_path)
    q = queue.Queue()

    cg = Cgroup(cg_path)
    cg.open()

    cg.wakeup(q)
    self.assertHasNoMessages(q)

    enable_memlimit_and_trigger_oom(cg_path)

    # The test program should fill 128 MB rather fast; give it 10s
    for _ in range(100):
        cg.wakeup(q)
        try:
            msg = q.get_nowait()
        except queue.Empty:
            time.sleep(0.1)
            continue
        self.assertIsInstance(msg, RestartRequestedMessage)
        self.assertEqual(cg, msg.cg)
        break
    else:
        raise Exception("Queue never received a message!")

    cg.close()
def setUp(self):
    self.mock_cg = tempfile.mkdtemp()
    self.write_oom_control()
    with open(self.cg_path("memory.pressure_level"), "w") as f:
        f.write('')
    self.monitor = Cgroup(self.mock_cg)
    self.queue = queue.Queue()
def test_set_memory_limit(self):
    cg_path = create_random_cg(self.parent_cg_path)

    # Memory limits are enforced as a page size count, so we have to make
    # sure we choose a number that's properly aligned.
    limit = 123 * resource.getpagesize()

    cg = Cgroup(cg_path)
    cg.set_memory_limit_in_bytes(limit)
    self.assertEqual(limit, cg.memory_limit_in_bytes())
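# A minimal sketch (illustration only, not part of the project): the comment
# in test_set_memory_limit notes that memory limits are enforced at page
# granularity, so an arbitrary byte count would need to be rounded down to a
# page boundary before being passed to set_memory_limit_in_bytes().
import resource

def _page_aligned_limit(limit_in_bytes):
    page = resource.getpagesize()
    return (limit_in_bytes // page) * page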
def test_trigger_restart(self):
    cg_path = create_random_cg(self.parent_cg_path)
    q = queue.Queue()

    cg = Cgroup(cg_path)
    cg.open()

    cg.wakeup(q)
    self.assertRaises(queue.Empty, q.get_nowait)

    enable_memlimit_and_trigger_oom(cg_path)

    # The test program should fill 128 MB rather fast; give it 10s
    for _ in range(100):
        cg.wakeup(q)
        try:
            msg = q.get_nowait()
        except queue.Empty:
            time.sleep(0.1)
            continue
        self.assertIsInstance(msg, RestartRequestedMessage)
        self.assertEqual(cg, msg.cg)
        break
    else:
        raise Exception("Queue never received a message!")

    cg.close()
def enable_memlimit_and_trigger_oom(path):
    cg = Cgroup(path)
    cg.open()

    # Set a memory limit then disable the OOM killer via a wakeup
    cg.set_memory_limit_in_bytes(1024 * 1024 * 128)  # 128 MB
    cg.wakeup(queue.Queue())
    cg.close()

    test_program = 'l = []\nwhile True:\n l.append(object())'
    subprocess.Popen([
        "sudo", "cgexec", "-g", descriptor_from_cg_path(path),
        "python", "-c", test_program
    ])
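# A hedged sketch of what a helper like descriptor_from_cg_path might look
# like (the real implementation isn't shown here): `cgexec -g` expects a
# "controller:path" descriptor, where the path is relative to the controller
# mount point, assumed here to be /sys/fs/cgroup/memory.
import os

def _descriptor_from_cg_path_example(cg_path,
                                     mount_point="/sys/fs/cgroup/memory"):
    return "memory:{0}".format(os.path.relpath(cg_path, mount_point))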
def test_wakeup_on_sync(self):
    cg_path = create_random_cg(self.parent_cg_path)

    cg = Cgroup(cg_path)
    cg.open()
    cg.set_memory_limit_in_bytes(1024)
    self.assertEqual("0", cg.oom_control_status()["oom_kill_disable"])

    index = CgroupIndex(self.parent_cg_path, queue.Queue())
    index.open()
    index.sync()
    index.close()

    self.assertEqual("1", cg.oom_control_status()["oom_kill_disable"])
    cg.close()
def restart_one(root_cg, grace_period, container_id):
    q = queue.Queue()
    cg = Cgroup(os.path.join(root_cg, container_id))
    try:
        restart(grace_period, cg, q, q)
    except IOError:
        logger.error("%s: container does not exist", cg.name())
        return 1
    finally:
        while not q.empty():
            m = q.get()
            logger.debug("%s: received %s", cg.name(), m.__class__.__name__)
    return 0
def restart_one(root_cg_path, restart_adapter, restart_grace_period,
                container_id):
    q = queue.Queue()
    cg = Cgroup(os.path.join(root_cg_path, container_id))
    try:
        restart(restart_adapter, restart_grace_period, cg, q, q)
    except IOError:
        logger.error("%s: container does not exist", cg.name())
        return 1
    finally:
        while not q.empty():
            m = q.get()
            logger.debug("%s: received %s", cg.name(), m.__class__.__name__)
    return 0
def enable_memlimit_and_trigger_oom(path):
    cg = Cgroup(path)
    cg.open()

    # Set a memory limit then disable the OOM killer via a wakeup
    cg.set_memory_limit_in_bytes(1024 * 1024 * 128)  # 128 MB
    cg.wakeup(queue.Queue())
    cg.close()

    test_program = 'l = []\nwhile True:\n l.append(object())'
    subprocess.Popen([
        "sudo", "cgexec", "-g", _descriptor_from_cg_path(path),
        "python", "-c", test_program
    ])
def test_no_pids(self):
    self.q.put(RestartCgroupMessage(Cgroup("/some/foo"), []))
    self.q.put(ExitMessage())
    self.engine.run()
    self.assertHasLogged("foo", [
        "container exceeded its memory allocation",
        "container is restarting:",
    ])
def test_try_docker_without_retry(self):
    # This script will not succeed if run twice
    test_file = "{0}/foo".format(self.test_dir)
    shell = 'if test -f {0}; ' \
            'then exit 1; ' \
            'else touch {0} && exit 0; ' \
            'fi'.format(test_file)
    cmd = ['sh', '-c', shell]
    ret = docker.try_docker(Cgroup("/some/foo"), *cmd)
    self.assertTrue(ret)
def test_stale_cgroup(self):
    cg_path = create_random_cg(self.parent_cg_path)

    cg = Cgroup(cg_path)
    cg.open()

    delete_cg(cg_path)

    q = queue.Queue()
    cg.wakeup(q, None)
    self.assertRaises(EnvironmentError, cg.wakeup, q, None,
                      raise_for_stale=True)

    cg.close()
def test_ps_table(self):
    cg_path = create_random_cg(self.parent_cg_path)

    subprocess.Popen([
        "sudo", "cgexec", "-g", descriptor_from_cg_path(cg_path),
        "sh", "-c", "sleep 10"
    ])

    time.sleep(2)  # Sleep for a little bit to let them spawn

    cg = Cgroup(cg_path)
    table = cg.ps_table()
    self.assertEqual(2, len(table))

    by_name = {proc["name"]: proc for proc in table}
    self.assertEqual(["sh", "sleep"], sorted(by_name.keys()))

    for name in ["sh", "sleep"]:
        proc = by_name[name]
        self.assertIsInstance(proc["pid"], int)
        self.assertIsInstance(proc["memory_info"].vms, int)
        self.assertIsInstance(proc["memory_info"].rss, int)
        self.assertIsInstance(proc["cmdline"], list)
        self.assertIn(proc["status"], PROC_STATUSES_RAW.keys())
def sync(self):
    logger.debug("syncing cgroups")

    # Sync all monitors with disk, and remove stale ones. It's important to
    # actually *wakeup* monitors here, so as to ensure we don't race with
    # Docker when it creates a cgroup (which could result in us not seeing
    # the memory limit and therefore not disabling the OOM killer).
    for cg in list(self._path_hash.values()):
        try:
            cg.wakeup(self.job_queue, raise_for_stale=True)
        except EnvironmentError:
            logger.info("%s: deregistering", cg.name())
            self.remove(cg)

    for entry in os.listdir(self.root_cg_path):
        path = os.path.join(self.root_cg_path, entry)

        # Is this a CG or just a regular file?
        if not os.path.isdir(path):
            continue

        # We're already tracking this CG. It *might* have changed between
        # our check and now, but in that case we'll catch it at the next
        # sync.
        if path in self._path_hash:
            continue

        # This is a new CG, register it.
        cg = Cgroup(path)
        logger.info("%s: new cgroup", cg.name())

        # Register and wake up the CG immediately after, in case there
        # already is some handling to do (typically: disabling the OOM
        # killer). To avoid race conditions, we do this after registration
        # to ensure we can deregister immediately if the cgroup just
        # exited.
        self.register(cg)
        cg.wakeup(self.job_queue)
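# A hedged sketch (assumed usage, not taken from the project) of how sync()
# could be driven periodically so new Docker cgroups get picked up and stale
# ones are dropped. The CgroupIndex(path, queue) / open() / sync() / close()
# calls mirror the ones exercised in test_wakeup_on_sync; the 5-second
# interval and the threading.Event are arbitrary choices for illustration.
def _example_sync_loop(root_cg_path, job_queue, stop_event, interval=5.0):
    index = CgroupIndex(root_cg_path, job_queue)
    index.open()
    try:
        while not stop_event.is_set():
            index.sync()
            stop_event.wait(interval)
    finally:
        index.close()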
def _launch_container(self, options):
    cmd = ["docker", "run", "-d"] + options
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    if p.returncode:
        m = "{0} failed with status {1}:\n{2}\n{3}".format(
            cmd, p.returncode, out, err)
        self.fail(m)
    cid = out.decode("utf-8").strip()
    self._cids.append(cid)
    return Cgroup("/".join([CG_DOCKER_ROOT_DIR, cid]))
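# Hedged usage example (image and command are placeholders, not from the
# project): launching a memory-limited container through the helper above
# gives a test a cgroup that already carries a limit for the monitor to act
# on. `docker run -m` is the standard flag for a container memory limit.
#
#     cg = self._launch_container(["-m", "128m", "busybox", "sleep", "60"])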
def test_stale_cgroup(self):
    cg_path = create_random_cg(self.parent_cg_path)

    cg = Cgroup(cg_path)
    cg.open()

    delete_cg(cg_path)

    q = queue.Queue()
    cg.wakeup(q)
    self.assertRaises(EnvironmentError, cg.wakeup, q, raise_for_stale=True)

    cg.close()
def test_large_memory_value(self):
    # Newer versions of Python (e.g. 3.4) will show this value in
    # scientific notation, which isn't desirable here.
    size = 2 * 1024 * 1024 * 1024  # 2GB

    MemInfo = namedtuple('MemInfo', ["rss", "vms"])
    ps_table = [{
        "pid": 123,
        "ppid": 0,
        "memory_info": MemInfo(rss=size, vms=size),
        "cmdline": ["some", "proc"],
        "status": psutil.STATUS_RUNNING,
    }]

    self.q.put(RestartCgroupMessage(Cgroup("/some/foo"), ps_table))
    self.q.put(ExitMessage())
    self.engine.run()
    self.assertHasLogged("foo", [
        "container exceeded its memory allocation",
        "container is restarting:",
        re.compile(r"123\s+0\s+2097152\s+2097152\s+R\s+some proc"),
    ])
def test_restart_cgroup(self):
    MemInfo = namedtuple('MemInfo', ["rss", "vms"])
    ps_table = [{
        "pid": 123,
        "memory_info": MemInfo(rss=1024 * 8, vms=1024 * 16),
        "cmdline": ["some", "proc"],
        "status": psutil.STATUS_STOPPED,
    }, {
        "pid": 456,
        "memory_info": MemInfo(rss=1024 * 2, vms=1024 * 4),
        "cmdline": ["sh", "-c", "a && b"],
        "status": psutil.STATUS_RUNNING,
    }]

    self.q.put(RestartCgroupMessage(Cgroup("/some/foo"), ps_table))
    self.q.put(ExitMessage())
    self.engine.run()
    self.assertHasLogged("foo", [
        "container exceeded its memory allocation",
        "container is restarting:",
        re.compile(r"123\s+16\s+8\s+T\s+some proc"),
        re.compile(r'456\s+4\s+2\s+R\s+sh -c "a && b"'),
    ])
def sync(self):
    logger.debug("syncing cgroups")

    # Sync all monitors with disk, and deregister stale ones. It's
    # important to actually *wakeup* monitors here, so as to ensure we
    # don't race with Docker when it creates a cgroup (which could result
    # in us not seeing the memory limit and therefore not disabling the OOM
    # killer).
    for cg in list(self._path_hash.values()):
        try:
            cg.wakeup(self.job_queue, None, raise_for_stale=True)
        except EnvironmentError:
            self.deregister(cg)
            cg.close()

    for entry in os.listdir(self.root_cg_path):
        path = os.path.join(self.root_cg_path, entry)

        # Is this a CG or just a regular file?
        if not os.path.isdir(path):
            continue

        # We're already tracking this CG. It *might* have changed between
        # our check and now, but in that case we'll catch it at the next
        # sync.
        if path in self._path_hash:
            continue

        # This is a new CG. Register it and wake it up immediately after,
        # in case there already is some handling to do (typically:
        # disabling the OOM killer).
        cg = Cgroup(path)
        try:
            cg.open()
        except EnvironmentError as e:
            # CG exited before we had a chance to register it. That's OK.
            logger.warning("%s: error opening new cg: %s", cg.name(), e)
        else:
            self.register(cg)
            cg.wakeup(self.job_queue, None)
def test_disable_oom_killer(self):
    cg_path = create_random_cg(self.parent_cg_path)

    cg = Cgroup(cg_path)
    cg.open()

    cg.wakeup(queue.Queue(), None)
    self.assertEqual("0", cg.oom_control_status()["oom_kill_disable"])

    # The OOM killer should be disabled once a memory limit is set
    cg.set_memory_limit_in_bytes(1024)
    cg.wakeup(queue.Queue(), None)
    self.assertEqual("1", cg.oom_control_status()["oom_kill_disable"])

    cg.close()
def test_disable_oom_killer(self):
    cg_path = create_random_cg(self.parent_cg_path)

    cg = Cgroup(cg_path)
    cg.open()

    cg.wakeup(queue.Queue())
    self.assertEqual("0", cg.oom_control_status()["oom_kill_disable"])

    # The OOM killer should be disabled once a memory limit is set
    cg.set_memory_limit_in_bytes(1024)
    cg.wakeup(queue.Queue())
    self.assertEqual("1", cg.oom_control_status()["oom_kill_disable"])

    cg.close()
def test_open_close(self):
    cg_path = create_random_cg(self.parent_cg_path)
    cg = Cgroup(cg_path)
    cg.open()
    cg.close()
def test_restart_timeout(self):
    self.q.put(RestartTimeoutMessage(Cgroup("/some/foo"), 3))
    self.q.put(ExitMessage())
    self.engine.run()
    self.assertHasLogged(
        "foo", ["container did not exit within 3 seconds grace period"])
def set_memlimit(path):
    logger.info("set memlimit in: %s", path)
    cg = Cgroup(path)
    cg.set_memory_limit_in_bytes(1024 * 1024 * 128)  # 128 MB
def test_memory_stat(self):
    cg_path = create_random_cg(self.parent_cg_path)
    cg = Cgroup(cg_path)
    lines = cg.memory_stat_lines()
    self.assertEqual("cache 0", lines[0])
def test_new_cgroup(self):
    self.q.put(NewCgroupMessage(Cgroup("/some/foo")))
    self.q.put(ExitMessage())
    self.engine.run()
    self.assertHasLogged("foo", ["container has started"])
class CgroupTestUnit(unittest.TestCase):
    def setUp(self):
        self.mock_cg = tempfile.mkdtemp()
        self.monitor = Cgroup(self.mock_cg)
        self.queue = queue.Queue()

    def tearDown(self):
        shutil.rmtree(self.mock_cg)

    # Helpers
    def write_oom_control(self, oom_kill_disable="0", under_oom="0"):
        control = [
            "oom_kill_disable {0}".format(oom_kill_disable),
            "under_oom {0}".format(under_oom)
        ]
        with open(self.cg_path("memory.oom_control"), "w") as f:
            f.write("\n".join(control))
            f.write("\n")

    def write_memory_limit(self, memory_limit=9223372036854771712):
        with open(self.cg_path("memory.limit_in_bytes"), "w") as f:
            f.write(str(memory_limit))
            f.write("\n")

    def cg_path(self, path):
        return os.path.join(self.mock_cg, path)

    # Tests
    def test_open(self):
        self.write_oom_control()

        self.monitor.open()
        evt_fileno = self.monitor.event_fileno()
        oom_control_fileno = self.monitor.oom_control.fileno()
        self.monitor.close()

        with open(self.cg_path("cgroup.event_control")) as f:
            e = "{0} {1}\n".format(evt_fileno, oom_control_fileno)
            self.assertEqual(e, f.read())

    def test_wakeup_disable_oom_killer(self):
        self.write_oom_control()
        self.write_memory_limit(1024)

        self.monitor.open()
        self.monitor.wakeup(self.queue)
        self.monitor.close()

        with open(self.cg_path("memory.oom_control")) as f:
            self.assertEqual("1\n", f.read())

    def test_wakeup_oom_killer_is_disabled(self):
        self.write_oom_control(oom_kill_disable="1")
        self.write_memory_limit(1024)

        self.monitor.open()
        self.monitor.wakeup(self.queue)
        self.monitor.close()

        # File should not have been touched
        with open(self.cg_path("memory.oom_control")) as f:
            self.assertEqual("oom_kill_disable 1\n", f.readline())

    def test_wakeup_no_memory_limit(self):
        self.write_oom_control(oom_kill_disable="0")
        self.write_memory_limit()

        self.monitor.open()
        self.monitor.wakeup(self.queue)
        self.monitor.close()

        # File should not have been touched
        with open(self.cg_path("memory.oom_control")) as f:
            self.assertEqual("oom_kill_disable 0\n", f.readline())

    def test_wakeup_stale(self):
        self.write_oom_control(oom_kill_disable="0")

        self.monitor.open()
        os.close(self.monitor.oom_control.fileno())

        self.monitor.wakeup(self.queue)
        self.assertRaises(EnvironmentError, self.monitor.wakeup, self.queue,
                          raise_for_stale=True)

        # Close the other FD manually. We still need to attempt closing the
        # wrapper to avoid a resource warning.
        os.close(self.monitor.event_fileno())
        try:
            self.monitor.oom_control.close()
        except EnvironmentError:
            pass
def test_try_docker_and_wait_fatal(self):
    t0 = time.time()
    cmd = ['docker', 'restart', 'foobar']
    ret = docker.try_docker(Cgroup("/some/foo"), *cmd)
    self.assertFalse(ret)
    self.assertLess(time.time() - t0, 1)
def setUp(self):
    self.mock_cg = tempfile.mkdtemp()
    self.monitor = Cgroup(self.mock_cg)
    self.queue = queue.Queue()
class CgroupTestUnit(unittest.TestCase, QueueAssertionHelper):
    def setUp(self):
        self.mock_cg = tempfile.mkdtemp()
        self.write_oom_control()
        with open(self.cg_path("memory.pressure_level"), "w") as f:
            f.write('')
        self.monitor = Cgroup(self.mock_cg)
        self.queue = queue.Queue()

    def tearDown(self):
        shutil.rmtree(self.mock_cg)

    # Helpers
    def write_oom_control(self, oom_kill_disable="0", under_oom="0"):
        control = [
            "oom_kill_disable {0}".format(oom_kill_disable),
            "under_oom {0}".format(under_oom)
        ]
        with open(self.cg_path("memory.oom_control"), "w") as f:
            f.write("\n".join(control))
            f.write("\n")

    def write_memory_usage(self, memory_usage=12345):
        with open(self.cg_path("memory.usage_in_bytes"), "w") as f:
            f.write(str(memory_usage))
            f.write("\n")

    def write_memory_limit(self, memory_limit=9223372036854771712):
        with open(self.cg_path("memory.limit_in_bytes"), "w") as f:
            f.write(str(memory_limit))
            f.write("\n")

    def cg_path(self, path):
        return os.path.join(self.mock_cg, path)

    # Tests
    def test_open(self):
        self.write_oom_control()

        self.monitor.open()
        expected = "{0} {1}\n{2} {3} {4}\n".format(
            self.monitor.event_oom.fileno(),
            self.monitor.oom_control.fileno(),
            self.monitor.event_pressure.fileno(),
            self.monitor.memory_pressure.fileno(),
            "critical"
        )
        self.monitor.close()

        with open(self.cg_path("cgroup.event_control")) as f:
            self.assertEqual(expected, f.read())

    def test_wakeup_with_limits_and_disable_oom_killer(self):
        self.write_oom_control()
        self.write_memory_limit(1024)

        self.monitor.open()
        self.monitor.wakeup(self.queue, None)
        self.monitor.close()

        with open(self.cg_path("memory.oom_control")) as f:
            self.assertEqual("1\n", f.read())

    def test_wakeup_with_oom_killer_disabled_is_noop(self):
        self.write_oom_control(oom_kill_disable="1")
        self.write_memory_limit(1024)

        self.monitor.open()
        self.monitor.wakeup(self.queue, None)
        self.monitor.close()

        # File should not have been touched
        with open(self.cg_path("memory.oom_control")) as f:
            self.assertEqual("oom_kill_disable 1\n", f.readline())

    def test_wakeup_without_limits_is_noop(self):
        self.write_oom_control(oom_kill_disable="0")
        self.write_memory_limit()

        self.monitor.open()
        self.monitor.wakeup(self.queue, None)
        self.monitor.close()

        # File should not have been touched
        with open(self.cg_path("memory.oom_control")) as f:
            self.assertEqual("oom_kill_disable 0\n", f.readline())

    def test_wakeup_with_stale_group_does_not_raise(self):
        self.write_oom_control(oom_kill_disable="0")

        self.monitor.open()
        os.close(self.monitor.oom_control.fileno())

        self.monitor.wakeup(self.queue, None)
        self.assertRaises(EnvironmentError, self.monitor.wakeup, self.queue,
                          None, raise_for_stale=True)

        # Close the other FD manually. We still need to attempt closing the
        # wrapper to avoid a resource warning.
        os.close(self.monitor.event_oom.fileno())
        try:
            self.monitor.oom_control.close()
        except EnvironmentError:
            pass

    def test_wakeup_under_oom_requests_restart(self):
        self.monitor.open()
        self.write_oom_control(oom_kill_disable="1", under_oom="1")
        self.monitor.wakeup(self.queue, None)
        self.assertHasMessageForCg(
            self.queue, RestartRequestedMessage, self.mock_cg
        )

    def test_wakeup_memory_pressure_notifies(self):
        self.monitor.open()
        self.write_memory_usage()
        self.write_memory_limit()
        self.monitor.wakeup(
            self.queue, self.monitor.event_pressure.fileno()
        )
        self.assertHasMessageForCg(
            self.queue, MemoryPressureMessage, self.mock_cg
        )

    def test_pressure_wakeup_with_stale_group_does_not_raise(self):
        self.monitor.open()
        self.monitor.wakeup(
            self.queue, self.monitor.event_pressure.fileno()
        )
        self.assertHasMessageForCg(
            self.queue, MemoryPressureMessage, self.mock_cg
        )