def collect(self) -> None:
    """collect() is run every loop, collecting Plugin resources.

    Every time collect() is run it creates a new working Graph. It then creates
    instances of each Plugin and starts their thread which in turn runs the
    Plugin's collect() method. Once all Plugins have finished collecting cloud
    resources, it retrieves the Plugin's Graphs and appends them to its own
    working Graph. At the end the live Graph is swapped with the working Graph.
    """
    # Create a new graph container to hold the Graph() which we'll swap out at the end
    gc = GraphContainer(cache_graph=False)

    # Let interested parties know that we're about to start our collect run
    dispatch_event(Event(EventType.COLLECT_BEGIN, gc.graph))

    # Create instances of each Plugin()
    plugins = [Plugin() for Plugin in self.plugins]
    start_time = time.time()

    # First we run each Collector Plugin.
    # Each Plugin is a threading.Thread so we call start() on it, which runs the
    # collect() method on each plugin and in turn generates a Graph()
    for plugin in plugins:
        plugin.start()

    # Now we wait for each Plugin to complete its work or time out.
    # Because we always swap out the completed graph at the end of our collect run
    # it doesn't matter in which order we wait for (join) Plugins. I.e. there's no
    # speed advantage in checking for already completed Plugins and collecting
    # slow ones last.
    for plugin in plugins:
        timeout = start_time + ArgumentParser.args.timeout - time.time()
        if timeout < 1:
            timeout = 1
        log.info(f"Waiting for collector thread of plugin {plugin.cloud} to finish")
        plugin.join(timeout)
        if not plugin.is_alive():  # The plugin has finished its work
            if not is_directed_acyclic_graph(plugin.graph):
                log.error(
                    f"Graph of plugin {plugin.cloud} is not acyclic"
                    " - ignoring plugin results"
                )
                continue
            log.info(f"Merging graph of plugin {plugin.cloud} with global graph")
            gc.add(plugin.graph)
            # Connect the root of our graph with the plugin's
            gc.graph.add_edge(gc.GRAPH_ROOT, plugin.root)
        else:
            log.error(f"Plugin {plugin.cloud} timed out - discarding Plugin graph")

    sanitize(gc.graph, gc.GRAPH_ROOT)
    dispatch_event(Event(EventType.GENERATE_METRICS, gc.graph), blocking=True)
    dispatch_event(Event(EventType.COLLECT_FINISH, gc.graph), blocking=True)

    # Swap the live graph with the newly created one from our current run
    self.gc.graph = gc.graph
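

# Hedged sketch: a minimal illustration of the thread contract collect() relies
# on. It only assumes what the loop above actually uses: a Plugin is a
# threading.Thread exposing `cloud`, `root` and `graph` attributes whose run()
# (started via start()) populates the graph. ExampleCollectorPlugin, its
# resource names, and the use of networkx.DiGraph as a stand-in for the
# project's Graph type are all hypothetical, not the project's real plugin API.
import threading

from networkx import DiGraph  # stand-in for the project's Graph type


class ExampleCollectorPlugin(threading.Thread):
    cloud = "examplecloud"  # used by collect() above for log messages

    def __init__(self) -> None:
        super().__init__()
        self.graph = DiGraph()
        self.root = "examplecloud-root"  # collect() connects this to GRAPH_ROOT
        self.graph.add_node(self.root)

    def run(self) -> None:
        # Pretend to collect two cloud resources and attach them to our root
        for name in ("resource-a", "resource-b"):
            self.graph.add_node(name)
            self.graph.add_edge(self.root, name)


# Usage mirrors the loop above:
#   plugin = ExampleCollectorPlugin(); plugin.start(); plugin.join(timeout)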


def test_args():
    arg_parser = get_arg_parser()
    WebServer.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    Processor.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    event_add_args(arg_parser)
    arg_parser.parse_args()
    assert ArgumentParser.args.interval == 3600
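

# Hedged sketch: the add_args(arg_parser) calls above suggest each component
# registers its own flags on a shared parser via a static hook. The real
# implementations are not shown in this section; this guess is grounded only
# in the asserted default of 3600 for --interval, and SomeComponent is a
# hypothetical stand-in.
import argparse


class SomeComponent:
    @staticmethod
    def add_args(arg_parser: argparse.ArgumentParser) -> None:
        arg_parser.add_argument(
            "--interval",
            help="Collection interval in seconds (default: 3600)",
            dest="interval",
            type=int,
            default=3600,
        )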


def test_graph_container():
    gc = GraphContainer(cache_graph=False)
    g = Graph()
    n1 = SomeTestResource("foo", {})
    n2 = SomeTestResource("bar", {})
    g.add_node(n1)
    gc.graph.add_resource(gc.GRAPH_ROOT, n2)
    gc.add(g)
    gc.graph.add_edge(n1, n2)
    assert len(gc.graph.nodes) == 3
    assert len(gc.graph.edges) == 2
    assert gc.graph.search_first("id", "bar") == n2
    assert gc.graph.search_first_parent_class(n2, SomeTestResource) == n1
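

# Hedged sketch: SomeTestResource is a test fixture that is not shown in this
# section. Assuming a BaseResource-style base class that takes an id and a
# tags dict and requires a delete() implementation, a minimal version could
# look like this; the project's real fixture and the import path are
# assumptions and may differ.
from cloudkeeper.baseresources import BaseResource


class SomeTestResource(BaseResource):
    resource_type = "some_test_resource"

    def delete(self, graph) -> bool:
        # A test fixture has nothing to delete
        return False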


def test_processor():
    arg_parser = get_arg_parser()
    Processor.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)
    arg_parser.parse_args()
    graph_container = GraphContainer(cache_graph=False)
    plugins = [SomeTestPlugin]
    processor = Processor(graph_container, plugins)
    processor.daemon = True
    processor.start()
    time.sleep(1)
    # +2: besides the collected resources the graph contains the container's
    # GRAPH_ROOT and (presumably) the test plugin's own root node
    assert len(processor.gc.graph.nodes) == num_resources + 2
    processor.shutdown(Event(EventType.SHUTDOWN))


def test_web():
    # Ask the OS for a free ephemeral port by binding to port 0
    tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    tcp.bind(("", 0))
    _, free_port = tcp.getsockname()
    tcp.close()  # fixme: race
    arg_parser = get_arg_parser()
    WebServer.add_args(arg_parser)
    event_add_args(arg_parser)
    arg_parser.parse_args()
    ArgumentParser.args.web_port = free_port
    gc = GraphContainer(cache_graph=False)
    web_server = WebServer(gc)
    web_server.daemon = True
    web_server.start()
    endpoint = f"http://localhost:{free_port}"
    r = requests.get(f"{endpoint}/health")
    assert r.content == b"ok\r\n"
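

# Hedged sketch: the ephemeral-port trick above is inherently racy (hence the
# fixme) because another process can grab the port between close() and the web
# server's own bind(). A small hypothetical helper that encapsulates the same
# technique; it narrows but cannot close that window.
import socket
from contextlib import closing


def find_free_port() -> int:
    """Ask the OS for a free ephemeral port. Still racy: the port may be
    taken again between close() and the caller's bind()."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as tcp:
        tcp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        tcp.bind(("", 0))
        return tcp.getsockname()[1]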


def main() -> None:
    log.info("Cloudkeeper initializing")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    cloudkeeper.signal.parent_pid = os.getpid()

    # Add cli args
    arg_parser = get_arg_parser()
    logging.add_args(arg_parser)
    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Handle Ctrl+c and other means of termination/shutdown
    cloudkeeper.signal.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    # We're using a GraphContainer() to contain the graph which gets replaced
    # at runtime. This way we're not losing the context in other places like
    # the webserver when the graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce
    # graphml from it. The webserver serves Prometheus Metrics as well as
    # different graph endpoints.
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f"Starting persistent Plugin {Plugin}")
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    processor = Processor(graph_container, plugin_loader.plugins(PluginType.COLLECTOR))
    processor.daemon = True
    processor.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)

    time.sleep(5)
    cloudkeeper.signal.kill_children(cloudkeeper.signal.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    quit()
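

# Hedged sketch: the shutdown() listener registered in main() above is not
# shown in this section. Since main() blocks on shutdown_event.is_set(), a
# minimal version presumably just logs the trigger and sets the event; the
# real handler may well do more (e.g. stopping child threads).
def shutdown(event: Event) -> None:
    log.info(f"Received event {event.event_type}, shutting down")
    shutdown_event.set()  # unblocks the wait loop in main()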


def test_metrics():
    gc = GraphContainer(cache_graph=False)
    c = GraphCollector(gc)
    for metric in c.collect():
        assert type(metric) == GaugeMetricFamily
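

# Hedged usage sketch: the test above implies GraphCollector follows the
# prometheus_client custom-collector protocol (collect() yields metric
# families), so a registry can scrape it. CollectorRegistry and
# generate_latest are real prometheus_client APIs; wiring them to a fresh
# GraphContainer here mirrors main() and the test, but is illustrative only.
from prometheus_client import CollectorRegistry, generate_latest

registry = CollectorRegistry()
registry.register(GraphCollector(GraphContainer(cache_graph=False)))
print(generate_latest(registry).decode("utf-8"))  # Prometheus text exposition format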


def main() -> None:
    # Add cli args
    arg_parser = get_arg_parser()
    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Write log to a file in addition to stdout
    if ArgumentParser.args.logfile:
        log_formatter = logging.Formatter(log_format)
        fh = logging.FileHandler(ArgumentParser.args.logfile)
        fh.setFormatter(log_formatter)
        logging.getLogger().addHandler(fh)

    # Handle Ctrl+c and other means of termination/shutdown
    signal_on_parent_exit()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)
    signal(SIGINT, signal_handler)
    signal(SIGTERM, signal_handler)
    signal(SIGUSR1, signal_handler)

    # We're using a GraphContainer() to contain the graph which gets replaced
    # at runtime. This way we're not losing the context in other places like
    # the webserver when the graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce
    # graphml from it. The webserver serves Prometheus Metrics as well as
    # different graph endpoints.
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f"Starting persistent Plugin {Plugin}")
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    collector = Processor(graph_container, plugin_loader.plugins(PluginType.COLLECTOR))
    collector.daemon = True
    collector.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)

    time.sleep(5)
    log.info("Shutdown complete")
    quit()