Example #1
0
    def collect(self) -> None:
        """Run a single collect cycle over all collector Plugins.

        Each run builds a brand new working GraphContainer. One instance of
        every Plugin class in self.plugins is created and started (each Plugin
        is a threading.Thread whose run in turn generates a Graph()). After a
        Plugin finished within the time budget and its graph is a directed
        acyclic graph, that graph is merged into the working graph and
        attached to the working graph's root.

        Finally the working graph is sanitized, the GENERATE_METRICS and
        COLLECT_FINISH events are dispatched (blocking) and the live graph
        is replaced by the working graph.
        """
        # Fresh container; its Graph() replaces the live one at the very end
        working_gc = GraphContainer(cache_graph=False)
        # Tell interested parties that a collect run is about to begin
        dispatch_event(Event(EventType.COLLECT_BEGIN, working_gc.graph))
        # One instance per Plugin class
        plugins = [plugin_class() for plugin_class in self.plugins]
        began_at = time.time()

        # Kick off every collector thread
        for plugin in plugins:
            plugin.start()

        # Wait for the Plugins to finish or time out. The join order is
        # irrelevant because the completed graph is only swapped in at the
        # end of the run, so collecting slow Plugins last gains nothing.
        for plugin in plugins:
            # Remaining share of the overall time budget, never below 1 second
            remaining = max(
                began_at + ArgumentParser.args.timeout - time.time(), 1)
            log.info(
                f'Waiting for collector thread of plugin {plugin.cloud} to finish'
            )
            plugin.join(remaining)

            if plugin.is_alive():
                # Still running after join() returned: the Plugin timed out
                log.error(
                    f'Plugin {plugin.cloud} timed out - discarding Plugin graph'
                )
                continue

            if not is_directed_acyclic_graph(plugin.graph):
                log.error(
                    f'Graph of plugin {plugin.cloud} is not acyclic - ignoring plugin results'
                )
                continue

            log.info(
                f'Merging graph of plugin {plugin.cloud} with global graph'
            )
            working_gc.add(plugin.graph)
            # Hook the Plugin's root node below the root of our own graph
            working_gc.graph.add_edge(working_gc.GRAPH_ROOT, plugin.root)

        sanitize(working_gc.graph, working_gc.GRAPH_ROOT)
        for event_type in (EventType.GENERATE_METRICS,
                           EventType.COLLECT_FINISH):
            dispatch_event(Event(event_type, working_gc.graph), blocking=True)
        # Swap the live graph for the one produced by this run
        self.gc.graph = working_gc.graph
def test_args():
    """Check that the parsed arguments expose the expected interval of 3600.

    The WebServer, GraphContainer, Processor, PluginLoader and event args are
    registered on a fresh arg parser, in that order, before parsing.
    """
    parser = get_arg_parser()
    registrars = (
        WebServer.add_args,
        GraphContainer.add_args,
        Processor.add_args,
        PluginLoader.add_args,
        event_add_args,
    )
    for register in registrars:
        register(parser)
    parser.parse_args()
    assert ArgumentParser.args.interval == 3600
Example #3
0
def test_graph_container():
    """Merge a standalone Graph into a GraphContainer and verify the result.

    The test expects 3 nodes and 2 edges after the merge, that "bar" can be
    found by id, and that "foo" is reported as the first parent of "bar"
    with class SomeTestResource.
    """
    container = GraphContainer(cache_graph=False)
    standalone = Graph()
    foo = SomeTestResource("foo", {})
    bar = SomeTestResource("bar", {})

    # "foo" lives in the standalone graph, "bar" below the container's root
    standalone.add_node(foo)
    container.graph.add_resource(container.GRAPH_ROOT, bar)

    # Merge the standalone graph, then link foo -> bar in the merged graph
    container.add(standalone)
    container.graph.add_edge(foo, bar)

    assert len(container.graph.nodes) == 3
    assert len(container.graph.edges) == 2
    assert container.graph.search_first("id", "bar") == bar
    assert container.graph.search_first_parent_class(
        bar, SomeTestResource) == foo
Example #4
0
def test_processor():
    """Run a Processor with SomeTestPlugin and check the resulting node count.

    After giving the Processor thread one second to work, the live graph is
    expected to hold num_resources + 2 nodes. The Processor is then shut down
    via a SHUTDOWN event.
    """
    parser = get_arg_parser()
    for register in (Processor.add_args, GraphContainer.add_args,
                     event_add_args):
        register(parser)
    parser.parse_args()

    proc = Processor(GraphContainer(cache_graph=False), [SomeTestPlugin])
    proc.daemon = True
    proc.start()

    # Give the Processor thread time to complete a collect run
    time.sleep(1)
    assert len(proc.gc.graph.nodes) == num_resources + 2
    proc.shutdown(Event(EventType.SHUTDOWN))
Example #5
0
def test_web():
    """Start a WebServer on a free port and verify its /health endpoint.

    A free TCP port is discovered by binding a probe socket to port 0, the
    port is written to ArgumentParser.args.web_port and a WebServer thread is
    started. The /health endpoint is then expected to answer b"ok\\r\\n".
    """
    # Let the OS pick an unused port. The context manager guarantees the
    # probe socket is closed even if bind() or getsockname() raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as tcp:
        tcp.bind(("", 0))
        _, free_port = tcp.getsockname()
    # fixme: race - the port is released here and could be taken by another
    # process before it is used again below
    arg_parser = get_arg_parser()
    WebServer.add_args(arg_parser)
    event_add_args(arg_parser)
    arg_parser.parse_args()

    # Override the parsed web port with the one we just probed
    ArgumentParser.args.web_port = free_port

    gc = GraphContainer(cache_graph=False)
    web_server = WebServer(gc)
    web_server.daemon = True
    web_server.start()

    endpoint = f"http://localhost:{free_port}"

    # Without a timeout requests waits indefinitely, which would hang the
    # whole test run if the server never answers.
    r = requests.get(f"{endpoint}/health", timeout=10)
    assert r.content == b"ok\r\n"
def main() -> None:
    """Cloudkeeper entry point: set up all components and wait for shutdown.

    Registers the command line arguments of every component and Plugin,
    parses them, installs signal handling and the SHUTDOWN event listener,
    then starts the Scheduler, Cli, WebServer, all persistent Plugins and the
    Processor as daemon threads. Afterwards it blocks until shutdown_event is
    set, logging stats every 900 seconds, before killing child processes and
    exiting the interpreter.

    Raises:
        SystemExit: always, once the shutdown sequence has completed.
    """
    import sys

    log.info("Cloudkeeper initializing")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception as e:
        # Best effort only. Catching Exception instead of using a bare except
        # keeps KeyboardInterrupt and SystemExit from being swallowed here.
        log.debug(f"Unable to create a new process group: {e}")

    cloudkeeper.signal.parent_pid = os.getpid()

    # Add cli args
    arg_parser = get_arg_parser()

    logging.add_args(arg_parser)
    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Handle Ctrl+c and other means of termination/shutdown
    cloudkeeper.signal.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    # We're using a GraphContainer() to contain the graph which gets replaced
    # at runtime. This way we're not losing the context in other places like
    # the webserver when the graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce graphml
    # from it. The webserver serves Prometheus Metrics as well as different graph
    # endpoints.
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    # A failing persistent Plugin must not prevent the remaining ones or the
    # rest of the startup sequence from running, hence the broad catch.
    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f"Starting persistent Plugin {Plugin}")
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    processor = Processor(graph_container,
                          plugin_loader.plugins(PluginType.COLLECTOR))
    processor.daemon = True
    processor.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)
    time.sleep(5)
    cloudkeeper.signal.kill_children(cloudkeeper.signal.SIGTERM,
                                     ensure_death=True)
    log.info("Shutdown complete")
    # quit() is a helper injected by the site module for interactive use;
    # sys.exit() is the supported way to end a program.
    sys.exit()
Example #7
0
def test_metrics():
    """Every metric yielded by GraphCollector.collect() is a GaugeMetricFamily.

    The check is a strict type comparison, so subclasses do not qualify.
    """
    collector = GraphCollector(GraphContainer(cache_graph=False))
    assert all(
        type(metric) == GaugeMetricFamily for metric in collector.collect())
Example #8
0
def main() -> None:
    """Program entry point: set up all components and wait for shutdown.

    Registers the command line arguments of every component and Plugin,
    parses them, optionally attaches a log file handler, installs the signal
    handlers and the SHUTDOWN event listener, then starts the Scheduler, Cli,
    WebServer, all persistent Plugins and the Processor as daemon threads.
    Afterwards it blocks until shutdown_event is set, logging stats every
    900 seconds, before exiting the interpreter.

    Raises:
        SystemExit: always, once the shutdown sequence has completed.
    """
    import sys

    # Add cli args
    arg_parser = get_arg_parser()

    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have added their args to the arg parser
    arg_parser.parse_args()

    # Write log to a file in addition to stdout
    if ArgumentParser.args.logfile:
        log_formatter = logging.Formatter(log_format)
        fh = logging.FileHandler(ArgumentParser.args.logfile)
        fh.setFormatter(log_formatter)
        logging.getLogger().addHandler(fh)

    # Handle Ctrl+c and other means of termination/shutdown
    signal_on_parent_exit()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)
    signal(SIGINT, signal_handler)
    signal(SIGTERM, signal_handler)
    signal(SIGUSR1, signal_handler)

    # We're using a GraphContainer() to contain the graph which gets replaced at runtime.
    # This way we're not losing the context in other places like the webserver when the
    # graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce graphml from it
    # The webserver serves Prometheus Metrics as well as different graph endpoints
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    # A failing persistent Plugin must not prevent the remaining ones or the
    # rest of the startup sequence from running, hence the broad catch.
    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f'Starting persistent Plugin {Plugin}')
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f'Caught unhandled persistent Plugin exception {e}')

    collector = Processor(graph_container,
                          plugin_loader.plugins(PluginType.COLLECTOR))
    collector.daemon = True
    collector.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)
    time.sleep(5)
    log.info('Shutdown complete')
    # quit() is a helper injected by the site module for interactive use;
    # sys.exit() is the supported way to end a program.
    sys.exit()