def housekeeping(cls, configs=None):
    """Delete Prometheus textfile metrics that match no running container.

    Meant to be run periodically as a background clean-up. Only the files
    matched by the glob ``data_dir + prom_textfile_style`` are considered,
    i.e. a single layer of files under ``data_dir`` is assumed.

    A metric file is kept when the ID of any container returned by
    ``APIClient(configs).containers()`` occurs as a substring of the file's
    path; every other matched file is removed.

    Args:
        cls: Unused by this routine.
        configs: Configuration mapping. ``configs["backend"]["data_dir"]``
            and ``configs["backend"]["prom_textfile_style"]`` are
            concatenated as-is (no path separator is inserted) to build
            the glob pattern.

    Raises:
        KeyError: If the ``backend`` settings are missing from ``configs``.
    """
    # Use None rather than a shared mutable ``{}`` default; the lookups below
    # still raise KeyError on an empty mapping, exactly as before.
    if configs is None:
        configs = {}

    backend_conf = configs["backend"]
    pattern = backend_conf["data_dir"] + backend_conf["prom_textfile_style"]
    prom_textfiles = glob.glob(pattern)

    client = APIClient(configs)
    current_container_ids = [cnter["Id"] for cnter in client.containers()]

    for metric_file in prom_textfiles:
        # Substring match against the whole path, as the original did.
        if any(cid in metric_file for cid in current_container_ids):
            continue

        # Out-of-date metric: remove it, but do not let one bad file (already
        # gone, permission denied, ...) abort the whole housekeeping pass.
        try:
            os.remove(metric_file)
        except OSError as err:
            print("Failed to clean out-of-date metric file {}: {}".format(metric_file, err))
            continue
        print("Cleaned out-of-date metric file: {}".format(metric_file))
def _start_collector(container_id, configs):
    """Create, daemonize and start a ContainerAppMetricsCollector for one container."""
    collector = ContainerAppMetricsCollector(container_id, configs)
    collector.setDaemon(True)
    collector.start()
    return collector


def main():
    """Run the CMonitor supervisor loop.

    Start-up:
      * registers ``sigterm_clean`` for SIGTERM and SIGINT;
      * loads the YAML config from the path in the ``CONFIG`` environment
        variable (default ``/config/config.yml``);
      * starts one collector thread per container currently reported by
        ``APIClient.containers()`` and records it in ``LIVE_CONTAINERS``
        as ``{container_id: [collector]}``.

    Then, every ``configs["check_interval"]`` seconds, forever:
      1. runs the configured backend's ``housekeeping``;
      2. stops and forgets collectors whose container is no longer listed;
      3. restarts collectors whose thread is missing or no longer alive;
      4. starts collectors for newly listed containers.

    This function does not return.
    """
    global LIVE_CONTAINERS
    print("CMonitor starting...\n")

    # Workaround for multithread _strptime bug:
    # https://groups.google.com/forum/#!topic/comp.lang.python/VFIppfWTblw
    datetime.datetime.strptime("2015-04-05T14:20:00", "%Y-%m-%dT%H:%M:%S")

    # register signal
    signal.signal(signal.SIGTERM, sigterm_clean)
    signal.signal(signal.SIGINT, sigterm_clean)

    # load configs.
    # TODO: make it as a startup parameter.
    config_file = os.getenv('CONFIG', '/config/config.yml')
    with open(config_file, 'r') as fp:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; switch to yaml.safe_load unless the config
        # really relies on custom YAML tags.
        configs = yaml.load(fp)

    cli = APIClient(configs=configs)

    # startup container metrics collector
    check_interval = configs["check_interval"]
    for c in cli.containers():
        print("Starting ContainerAppMetricsCollector for {}\n".format(c["Id"]))
        LIVE_CONTAINERS[c["Id"]] = [_start_collector(c["Id"], configs), ]

    print("CMonitor started...\n")

    while True:
        time.sleep(check_interval)

        # Backend housekeeping.
        # We setup a workaround here right now.
        BackendFactory(configs["backend"]["name"]).housekeeping(configs)

        # Re-read the set of live containers; anything we track that is no
        # longer listed is treated as offline.
        alive_ids = set(c["Id"] for c in cli.containers())
        tracked_ids = set(LIVE_CONTAINERS.keys())
        offlined_containers = tracked_ids - alive_ids
        onlined_containers = alive_ids - tracked_ids

        # offline some
        # Done BEFORE the crash-rebuild pass so that a collector is never
        # restarted for a container that has already disappeared.
        for offline_id in offlined_containers:
            # Forget the entry first: a failing stop() must not leave a dead
            # container registered (it would be rebuilt on every cycle).
            entry = LIVE_CONTAINERS.pop(offline_id, None)
            try:
                # A missing entry or a None collector raises here and is
                # reported through the same error branch.
                entry[0].stop()
                print("Stopped or deleted old container {}\n".format(offline_id))
            except Exception:
                print("ERROR while stopping the container {} collector, may already stopped!!!\n".format(offline_id))

        # rebuild live container crashed thread.
        # Iterate over a snapshot; only the inner list is mutated.
        for cid, slot in list(LIVE_CONTAINERS.items()):
            if slot[0] is None or not slot[0].is_alive():
                app_t = _start_collector(cid, configs)
                print("Rebuild crashed app metrics thread for {}\n".format(cid))
                slot[0] = app_t

        # online some
        for online_id in onlined_containers:
            try:
                LIVE_CONTAINERS[online_id] = [_start_collector(online_id, configs), ]
                print("appending new container {}, now lives {}\n".format(online_id, list(LIVE_CONTAINERS)))
            except Exception as e:
                print("Container {} append to LIVE_CONTAINERS ERROR, {}\n".format(online_id, e))