def core_actions_processor(metrics: Metrics, query_uri: str, message: dict) -> Optional[dict]:
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    log.debug(f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "generate_metrics":
                start_time = time.time()
                update_metrics(metrics, query_uri)
                run_time = time.time() - start_time
                log.debug(f"Updated metrics in {run_time:.2f} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message

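# A reference sketch of the message envelope this processor handles, inferred
# from the branches above. The "data" payload is illustrative; whatever the
# request carries is echoed back verbatim in the reply:
#
#     request:  {"kind": "action", "message_type": "generate_metrics", "data": {...}}
#     success:  {"kind": "action_done",  "message_type": "generate_metrics", "data": {...}}
#     failure:  {"kind": "action_error", "message_type": "generate_metrics", "data": {...}}
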
def __delitem__(self, key):
    if self.parent_resource and isinstance(self.parent_resource, BaseResource):
        log.debug(f"Calling parent resource to delete tag {key} in cloud")
        try:
            if self.parent_resource.delete_tag(key):
                log_msg = f"Successfully deleted tag {key} in cloud"
                self.parent_resource._changes.add("tags")
                self.parent_resource.log(log_msg)
                log.info(
                    f"{log_msg} for {self.parent_resource.kind}"
                    f" {self.parent_resource.id}"
                )
                return super().__delitem__(key)
            else:
                log_msg = f"Error deleting tag {key} in cloud"
                self.parent_resource.log(log_msg)
                log.error(
                    f"{log_msg} for {self.parent_resource.kind}"
                    f" {self.parent_resource.id}"
                )
        except Exception as e:
            log_msg = (
                f"Unhandled exception while trying to delete tag {key} in cloud:"
                f" {type(e)} {e}"
            )
            self.parent_resource.log(log_msg, exception=e)
            if self.parent_resource._raise_tags_exceptions:
                raise
            else:
                log.exception(log_msg)
    else:
        return super().__delitem__(key)

def collect_project(project_id: str, args=None, credentials=None) -> Optional[Dict]:
    """Collects an individual project.

    Called from collect() and run either within a thread or in a spawned
    process, depending on whether `--gcp-fork` was specified. Because a
    spawned process does not inherit any of our memory or file descriptors,
    we pass the already parsed `args` Namespace() to this method.
    """
    project = GCPProject(project_id, {})
    collector_name = f"gcp_{project.id}"
    resotolib.signal.set_thread_name(collector_name)

    if args is not None:
        ArgumentParser.args = args
        setup_logger("resotoworker-gcp")

    if credentials is not None:
        Credentials._credentials = credentials
        Credentials._initialized = True

    log.debug(f"Starting new collect process for project {project.dname}")

    try:
        gpc = GCPProjectCollector(project)
        gpc.collect()
    except Exception:
        log.exception(f"An unhandled error occurred while collecting {project.rtdname}")
    else:
        return gpc.graph

def on_message(self, ws, message):
    try:
        message: Dict = json.loads(message)
    except json.JSONDecodeError:
        log.exception(f"Unable to decode received message {message}")
        return
    self.queue.put(message)

def collect_account(account: AWSAccount, regions: List, args=None):
    collector_name = f"aws_{account.id}"
    resotolib.signal.set_thread_name(collector_name)

    if args is not None:
        ArgumentParser.args = args
        setup_logger("resotoworker-aws")

    log.debug(f"Starting new collect process for account {account.dname}")

    aac = AWSAccountCollector(regions, account)
    try:
        aac.collect()
    except botocore.exceptions.ClientError as e:
        log.exception(
            f"An AWS {e.response['Error']['Code']} error occurred while"
            f" collecting account {account.dname}"
        )
        metrics_unhandled_account_exceptions.labels(account=account.dname).inc()
    except Exception:
        log.exception(
            f"An unhandled error occurred while collecting AWS account {account.dname}"
        )
        metrics_unhandled_account_exceptions.labels(account=account.dname).inc()

    # The (possibly partial) graph is returned even if collection failed.
    return aac.graph

def dispatch_event(event: Event, blocking: bool = False) -> None:
    """Dispatch an Event"""
    waiting_str = "" if blocking else "not "
    log.debug(
        f"Dispatching event {event.event_type.name} and {waiting_str}waiting for"
        " listeners to return"
    )

    if event.event_type not in _events.keys():
        return

    with _events_lock.read_access:
        # Event listeners might unregister themselves during event dispatch
        # so we will work on a shallow copy while processing the current event.
        listeners = dict(_events[event.event_type])

    threads = {}
    for listener, listener_data in listeners.items():
        try:
            if listener_data["pid"] != os.getpid():
                continue

            if listener_data["one-shot"] and not listener_data["lock"].acquire(blocking=False):
                log.error(
                    f"Not calling one-shot listener {listener} of type"
                    f" {type(listener)} - can't acquire lock"
                )
                continue

            log.debug(
                f"Calling listener {listener} of type {type(listener)}"
                f" (blocking: {listener_data['blocking']})"
            )
            thread_name = (
                f"{event.event_type.name.lower()}_event"
                f"-{getattr(listener, '__name__', 'anonymous')}"
            )
            t = Thread(target=listener, args=[event], name=thread_name)
            if blocking or listener_data["blocking"]:
                threads[t] = listener
            t.start()
        except Exception:
            log.exception("Caught unhandled event callback exception")
        finally:
            if listener_data["one-shot"]:
                log.debug(
                    f"One-shot specified for event {event.event_type.name} "
                    f"listener {listener} - removing event listener"
                )
                remove_event_listener(event.event_type, listener)
                listener_data["lock"].release()

    start_time = time.time()
    for thread, listener in threads.items():
        timeout = start_time + listeners[listener]["timeout"] - time.time()
        if timeout < 1:
            timeout = 1
        log.debug(f"Waiting up to {timeout:.2f}s for event listener {thread.name} to finish")
        thread.join(timeout)
        log.debug(f"Event listener {thread.name} finished (timed out: {thread.is_alive()})")

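# dispatch_event() reads a per-listener registration record with the keys
# "pid", "one-shot", "lock", "blocking" and "timeout". A minimal sketch of a
# compatible registration, assuming that record shape (register_listener is a
# hypothetical helper, not the actual add_event_listener implementation):
import os
from threading import Lock


def register_listener(events: dict, event_type, listener,
                      blocking: bool = False, one_shot: bool = False,
                      timeout: int = 900) -> None:
    events.setdefault(event_type, {})[listener] = {
        "pid": os.getpid(),    # dispatch skips listeners registered in other processes
        "one-shot": one_shot,  # listener is removed after its first dispatch
        "lock": Lock(),        # guards a one-shot listener against firing twice
        "blocking": blocking,  # dispatcher joins the listener thread when set
        "timeout": timeout,    # max seconds the dispatcher waits in join()
    }
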
def validate_dataclass(node: BaseResource):
    for field in fields(node):
        value = getattr(node, field.name)
        try:
            check_type(str(value), value, field.type)
        except TypeError:
            log.exception(
                f"In {node.rtdname} expected {field.name}"
                f" type {field.type} ({type(field.type)})"
                f" for value {value} ({type(value)})"
            )

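# check_type() above appears to be typeguard's 2.x three-argument form,
# check_type(argname, value, expected_type). A usage sketch under that
# assumption:
#
#     from typing import Dict
#     from typeguard import check_type
#
#     check_type("tags", {"env": "prod"}, Dict[str, str])  # passes silently
#     check_type("tags", ["env"], Dict[str, str])          # raises TypeError
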
def worker(self) -> None:
    while not self.shutdown_event.is_set():
        message = self.queue.get()
        log.debug(f"{self.identifier} received: {message}")
        if self.message_processor is not None and callable(self.message_processor):
            try:
                result = self.message_processor(message)
                log.debug(f"Sending reply {result}")
                self.ws.send(json.dumps(result))
            except Exception:
                log.exception(f"Something went wrong while processing {message}")
        self.queue.task_done()

def bootstrap(self) -> bool:
    if ArgumentParser.args.cleanup_volumes:
        try:
            self.age = parse_delta(ArgumentParser.args.cleanup_volumes_age)
            log.debug(f"Volume Cleanup Plugin Age {self.age}")
        except ValueError:
            log.exception(
                f"Error while parsing Volume Cleanup Age"
                f" {ArgumentParser.args.cleanup_volumes_age}"
            )
        else:
            return True
    return False

def pre_cleanup(self, graph=None) -> bool:
    if not hasattr(self, "pre_delete"):
        return True

    if graph is None:
        graph = self._graph

    if self.phantom:
        raise RuntimeError(f"Can't cleanup phantom resource {self.rtdname}")

    if self.cleaned:
        log.debug(f"Resource {self.rtdname} has already been cleaned up")
        return True

    account = self.account(graph)
    region = self.region(graph)
    if not isinstance(account, BaseAccount) or not isinstance(region, BaseRegion):
        log.error(
            "Could not determine account or region for pre cleanup of"
            f" {self.rtdname}"
        )
        return False

    log_suffix = f" in account {account.dname} region {region.name}"
    self.log("Trying to run pre clean up")
    log.debug(f"Trying to run pre clean up {self.rtdname}{log_suffix}")
    try:
        if not getattr(self, "pre_delete")(graph):
            self.log("Failed to run pre clean up")
            log.error(f"Failed to run pre clean up {self.rtdname}{log_suffix}")
            return False
        self.log("Successfully ran pre clean up")
        log.info(f"Successfully ran pre clean up {self.rtdname}{log_suffix}")
    except Exception as e:
        self.log("An error occurred during pre clean up", exception=e)
        log.exception(f"An error occurred during pre clean up {self.rtdname}{log_suffix}")
        cloud = self.cloud(graph)
        metrics_resource_pre_cleanup_exceptions.labels(
            cloud=cloud.name,
            account=account.dname,
            region=region.name,
            kind=self.kind,
        ).inc()
        return False
    return True

def on_message(self, ws, message):
    try:
        message: Dict = json.loads(message)
    except json.JSONDecodeError:
        log.exception(f"Unable to decode received message {message}")
        return
    log.debug(f"{self.identifier} received: {message}")
    if self.message_processor is not None and callable(self.message_processor):
        try:
            result = self.message_processor(message)
            log.debug(f"Sending reply {result}")
            ws.send(json.dumps(result))
        except Exception:
            log.exception(f"Something went wrong while processing {message}")

def bootstrap(self) -> bool:
    if ArgumentParser.args.cleanup_aws_loadbalancers:
        try:
            self.age = parse_delta(ArgumentParser.args.cleanup_aws_loadbalancers_age)
            log.debug(f"AWS Loadbalancer Cleanup Plugin Age {self.age}")
        except ValueError:
            log.exception(
                f"Error while parsing AWS Loadbalancer"
                f" Cleanup Age {ArgumentParser.args.cleanup_aws_loadbalancers_age}"
            )
        else:
            return True
    return False

def catch_and_log(*args, **kwargs):
    try:
        return f(*args, **kwargs)
    except do_raise:
        raise
    except Exception:
        args_str = ", ".join([repr(arg) for arg in args])
        kwargs_str = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
        if len(args) > 0 and len(kwargs) > 0:
            args_str += ", "
        log.exception(f"Caught exception in {f.__name__}({args_str}{kwargs_str})")

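# catch_and_log above is the inner wrapper of a decorator: `f` and `do_raise`
# come from an enclosing scope. A minimal sketch of that enclosing decorator
# factory, with the wrapper body abbreviated (the name except_log_and_pass is
# an assumption, not confirmed by the excerpt):
import functools


def except_log_and_pass(do_raise: tuple = ()):
    def decorator(f):
        @functools.wraps(f)
        def catch_and_log(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except do_raise:
                raise  # configured exception types propagate unchanged
            except Exception:
                log.exception(f"Caught exception in {f.__name__}")
        return catch_and_log
    return decorator

# Usage sketch:
#
#     @except_log_and_pass(do_raise=(KeyboardInterrupt,))
#     def flaky_operation():
#         ...
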
def clean(self, node: BaseResource) -> None:
    log_prefix = f"Resource {node.rtdname} is marked for removal"
    if ArgumentParser.args.cleanup_dry_run:
        log.debug(f"{log_prefix}, not calling cleanup method because of dry run flag")
        return
    log.debug(f"{log_prefix}, calling cleanup method")
    try:
        node.cleanup(self.graph)
    except Exception:
        log.exception(f"An exception occurred when running resource cleanup on {node.rtdname}")

def collect_team(self, client: StreamingWrapper) -> Optional[Dict]:
    """Collects an individual team."""
    projects = client.list_projects()
    # The team id is derived from the owner of the first project
    # (assumes at least one project exists).
    team_id = str(projects[0]["owner_id"])
    team = DigitalOceanTeam(id=team_id, tags={}, urn=f"do:team:{team_id}")
    try:
        dopc = DigitalOceanTeamCollector(team, client)
        dopc.collect()
    except Exception:
        log.exception(f"An unhandled error occurred while collecting team {team_id}")
    else:
        return dopc.graph

def add_edge(
    self,
    src: BaseResource,
    dst: BaseResource,
    key: Optional[EdgeKey] = None,
    edge_type: Optional[EdgeType] = None,
    **attr,
):
    if src is None or dst is None:
        log.error(f"Not creating edge from or to NoneType: {src} to {dst}")
        return
    if edge_type is None:
        edge_type = EdgeType.default
    if key is None:
        key = EdgeKey(src=src, dst=dst, edge_type=edge_type)

    if self.has_edge(src, dst, key=key):
        log.error(f"Edge from {src} to {dst} already exists in graph")
        return
    return_key = super().add_edge(src, dst, key=key, **attr)
    if (
        self._log_edge_creation
        and isinstance(src, BaseResource)
        and isinstance(dst, BaseResource)
    ):
        log.debug(f"Added edge from {src.rtdname} to {dst.rtdname} (type: {edge_type.value})")
        try:
            src.successor_added(dst, self)
        except Exception:
            log.exception(
                f"Unhandled exception while telling {src.rtdname}"
                f" that {dst.rtdname} was added as a successor"
            )
        try:
            dst.predecessor_added(src, self)
        except Exception:
            log.exception(
                f"Unhandled exception while telling {dst.rtdname}"
                f" that {src.rtdname} was added as a predecessor"
            )
    return return_key

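# EdgeKey is constructed above with the keyword arguments src, dst and
# edge_type. A minimal compatible definition, assuming a NamedTuple; the
# actual definition in the codebase may differ (BaseResource and EdgeType
# as imported elsewhere in this module):
from typing import NamedTuple


class EdgeKey(NamedTuple):
    src: BaseResource
    dst: BaseResource
    edge_type: EdgeType
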
def get_stats(graph=None) -> Dict:
    try:
        stats = {
            "active_threads": threading.active_count(),
            "thread_names": [thread.name for thread in threading.enumerate()],
            "garbage_collector": garbage_collector.get_stats(),
            "process": get_all_process_info(),
        }
        if sys.platform == "linux":
            # On Linux resource.getrusage() reports ru_maxrss in KiB.
            stats.update(
                {
                    "maxrss_parent_bytes": resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024,
                    "maxrss_children_bytes": resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss * 1024,
                }
            )
        else:
            stats.update({"maxrss_parent_bytes": 0, "maxrss_children_bytes": 0})
        stats["maxrss_total_bytes"] = (
            stats["maxrss_parent_bytes"] + stats["maxrss_children_bytes"]
        )
        num_fds_parent = stats["process"].get("parent", {}).get("num_file_descriptors", 0)
        num_fds_children = sum(
            v["num_file_descriptors"] for v in stats["process"].get("children", {}).values()
        )
        stats.update(
            {
                "maxrss_parent_human_readable": iec_size_format(stats["maxrss_parent_bytes"]),
                "maxrss_children_human_readable": iec_size_format(stats["maxrss_children_bytes"]),
                "maxrss_total_human_readable": iec_size_format(stats["maxrss_total_bytes"]),
                "num_fds_parent": num_fds_parent,
                "num_fds_children": num_fds_children,
                "num_fds_total": num_fds_parent + num_fds_children,
            }
        )
    except Exception:
        log.exception("Error while trying to get stats")
        return {}
    else:
        return stats

def pre_clean(self, node: BaseResource) -> None:
    if not hasattr(node, "pre_delete"):
        return

    log_prefix = f"Resource {node.rtdname} is marked for removal"
    if ArgumentParser.args.cleanup_dry_run:
        log.debug(f"{log_prefix}, not calling pre cleanup method because of dry run flag")
        return
    log.debug(f"{log_prefix}, calling pre cleanup method")
    try:
        node.pre_cleanup(self.graph)
    except Exception:
        log.exception(
            "An exception occurred when running resource pre cleanup on"
            f" {node.rtdname}"
        )

def search_first_parent_class(self, node, cls):
    """Return the first parent node matching a certain class.

    This is used to search up the graph and e.g. find the account
    that a graph node is a member of.
    """
    ret = None
    try:
        for predecessor_node in list(self.predecessors(node)):
            if isinstance(predecessor_node, cls):
                ret = predecessor_node
            else:
                ret = self.search_first_parent_class(predecessor_node, cls)
            if ret:
                break
    except RecursionError:
        log.exception(
            f"Recursive search error triggered for node {node}'s parent class {cls}"
        )
    return ret

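# Usage sketch: walking up the graph from a resource to its enclosing account
# (some_volume is a hypothetical node already added to the graph):
#
#     account = graph.search_first_parent_class(some_volume, BaseAccount)
#     if account is not None:
#         log.debug(f"{some_volume.rtdname} belongs to account {account.dname}")
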
def log_stats(graph=None, garbage_collector_stats: bool = False) -> None:
    stats = get_stats(graph)
    try:
        log.debug(
            f"Stats: max rss parent: {stats['maxrss_parent_human_readable']},"
            f" children: {stats['maxrss_children_human_readable']},"
            f" fds: {stats['num_fds_total']}/"
            f"{stats['process'].get('parent', {}).get('rlimit_nofile', [0])[0]}"
            f" active threads {stats['active_threads']}:"
            f" {', '.join([thread for thread in stats['thread_names']])}"
        )
        if graph:
            log.debug(f"Graph Stats: {stats['graph_size_human_readable']}")
        if garbage_collector_stats:
            gc_stats = " | ".join(
                [
                    f"Gen {i}: collections {data.get('collections')}, "
                    f"collected {data.get('collected')}, "
                    f"uncollectable {data.get('uncollectable')}"
                    for i, data in enumerate(stats["garbage_collector"])
                ]
            )
            log.debug(f"Garbage Collector Stats: {gc_stats}")
    except Exception:
        log.exception("Error while trying to log stats")

def core_actions_processor(
    collectors: List[BaseCollectorPlugin], message: Dict
) -> Optional[Dict]:
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    log.debug(f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "collect":
                start_time = time.time()
                collect_and_send(collectors)
                run_time = int(time.time() - start_time)
                log.info(f"Collect ran for {run_time} seconds")
            elif message_type == "cleanup":
                start_time = time.time()
                cleanup()
                run_time = int(time.time() - start_time)
                log.info(f"Cleanup ran for {run_time} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message

def main() -> None:
    setup_logger("resotoworker")

    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    resotolib.signal.parent_pid = os.getpid()

    # Add cli args
    # The following double parsing of cli args is done so that when
    # a user specifies e.g. `--collector aws --help` they are
    # no longer shown cli args for other collectors like gcp.
    collector_arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
        add_help=False,
        add_machine_help=False,
    )
    PluginLoader.add_args(collector_arg_parser)
    (args, _) = collector_arg_parser.parse_known_args()
    ArgumentParser.args = args

    arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
    )
    jwt_add_args(arg_parser)
    logging_add_args(arg_parser)
    graph_add_args(arg_parser)
    collect_add_args(arg_parser)
    cleanup_add_args(arg_parser)
    core_add_args(arg_parser)
    resotocore_add_args(arg_parser)
    CoreActions.add_args(arg_parser)
    WebApp.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    event_add_args(arg_parser)
    add_args(arg_parser)

    # Find resoto Plugins in the resoto.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Handle Ctrl+c and other means of termination/shutdown
    resotolib.signal.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    web_server = WebServer(WebApp())
    web_server.daemon = True
    web_server.start()

    core_actions = CoreActions(
        identifier=f"{ArgumentParser.args.resotocore_subscriber_id}-collect_cleanup",
        resotocore_uri=ArgumentParser.args.resotocore_uri,
        resotocore_ws_uri=ArgumentParser.args.resotocore_ws_uri,
        actions={
            "collect": {
                "timeout": ArgumentParser.args.timeout,
                "wait_for_completion": True,
            },
            "cleanup": {
                "timeout": ArgumentParser.args.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=partial(
            core_actions_processor, plugin_loader.plugins(PluginType.COLLECTOR)
        ),
    )

    task_queue_filter = {}
    if ArgumentParser.args.collector and len(ArgumentParser.args.collector) > 0:
        task_queue_filter = {"cloud": list(ArgumentParser.args.collector)}
    core_tasks = CoreTasks(
        identifier="workerd-tasks",
        resotocore_ws_uri=ArgumentParser.args.resotocore_ws_uri,
        tasks=["tag"],
        task_queue_filter=task_queue_filter,
        message_processor=core_tag_tasks_processor,
    )
    core_actions.start()
    core_tasks.start()

    for Plugin in plugin_loader.plugins(PluginType.ACTION):
        try:
            log.debug(f"Starting action plugin {Plugin}")
            plugin = Plugin()
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    # We wait for the shutdown Event to be set() and then end the program
    shutdown_event.wait()
    web_server.shutdown()
    time.sleep(1)  # everything gets 1000ms to shutdown gracefully before we force it
    resotolib.signal.kill_children(resotolib.signal.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    os._exit(0)

def main() -> None:
    setup_logger("resotoshell")
    shutdown_event = Event()
    arg_parser = ArgumentParser(description="resoto shell", env_args_prefix="RESOTOSHELL_")
    add_args(arg_parser)
    logging_add_args(arg_parser)
    jwt_add_args(arg_parser)
    arg_parser.parse_args()

    headers = {"Accept": "text/plain"}
    execute_endpoint = f"{ArgumentParser.args.resotocore_uri}/cli/execute"
    execute_endpoint += f"?resoto_session_id={rnd_str()}"
    if ArgumentParser.args.resotocore_graph:
        query_string = urlencode({"graph": ArgumentParser.args.resotocore_graph})
        execute_endpoint += f"&{query_string}"
    if ArgumentParser.args.resotocore_section:
        query_string = urlencode({"section": ArgumentParser.args.resotocore_section})
        execute_endpoint += f"&{query_string}"

    if ArgumentParser.args.stdin:
        shell = Shell(execute_endpoint, False, "monochrome")
        log.debug("Reading commands from STDIN")
        try:
            for command in sys.stdin.readlines():
                command = command.rstrip()
                shell.handle_command(command, headers)
        except KeyboardInterrupt:
            pass
        except (RuntimeError, ValueError) as e:
            log.error(e)
        except Exception:
            log.exception("Caught unhandled exception while processing CLI command")
        finally:
            shutdown_event.set()
    else:
        shell = Shell(execute_endpoint, True, detect_color_system())
        completer = None
        history_file = str(pathlib.Path.home() / ".resotoshell_history")
        history = FileHistory(history_file)
        session = PromptSession(history=history)
        log.debug("Starting interactive session")

        while not shutdown_event.is_set():
            try:
                command = session.prompt("> ", completer=completer)
                if command == "":
                    continue
                if command == "quit":
                    shutdown_event.set()
                    continue
                shell.handle_command(command, headers)
            except KeyboardInterrupt:
                pass
            except EOFError:
                shutdown_event.set()
            except (RuntimeError, ValueError) as e:
                log.error(e)
            except Exception:
                log.exception("Caught unhandled exception while processing CLI command")

    sys.exit(0)