Example 1
def core_actions_processor(metrics: Metrics, search_uri: str,
                           tls_data: TLSData, message: dict) -> Optional[dict]:
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    log.debug(
        f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "generate_metrics":
                start_time = time.time()
                update_metrics(metrics, search_uri, tls_data)
                run_time = time.time() - start_time
                log.debug(f"Updated metrics for {run_time:.2f} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message
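
A minimal sketch of how this processor might be driven; the Metrics/TLSData stubs, the no-op update_metrics, and the sample URI are illustrative stand-ins, not resotolib's real definitions:

import logging
import time

log = logging.getLogger(__name__)

class Metrics:  # stand-in for the resotolib type
    pass

class TLSData:  # stand-in for the resotolib type
    pass

def update_metrics(metrics, search_uri, tls_data):  # no-op stub
    pass

sample = {"kind": "action", "message_type": "generate_metrics", "data": {"task": "t-1"}}
reply = core_actions_processor(Metrics(), "https://localhost:8900", TLSData(), sample)
# reply == {"kind": "action_done", "message_type": "generate_metrics", "data": {"task": "t-1"}}
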
Example 2
    def on_message(self, ws, message):
        try:
            message: Dict = json.loads(message)
        except json.JSONDecodeError:
            log.exception(f"Unable to decode received message {message}")
            return
        self.queue.put(message)
Example 3
    def action_processor(self, message: Dict) -> Optional[Dict]:
        """Process incoming action messages"""
        if not isinstance(message, dict):
            log.error(f"Invalid message: {message}")
            return
        kind = message.get("kind")
        message_type = message.get("message_type")
        data = message.get("data")
        log.debug(
            f"Received message of kind {kind}, type {message_type}, data: {data}"
        )
        if kind == "action":
            try:
                if message_type == self.action:
                    start_time = time.time()
                    self.do_action(data)
                    run_time = int(time.time() - start_time)
                    log.debug(f"{self.action} ran for {run_time} seconds")
                else:
                    raise ValueError(f"Unknown message type {message_type}")
            except Exception as e:
                log.exception(f"Failed to {message_type}: {e}")
                reply_kind = "action_error"
            else:
                reply_kind = "action_done"

            reply_message = {
                "kind": reply_kind,
                "message_type": message_type,
                "data": data,
            }
            return reply_message
Example 4
    def __delitem__(self, key):
        if self.parent_resource and isinstance(self.parent_resource,
                                               BaseResource):
            log.debug(f"Calling parent resource to delete tag {key} in cloud")
            try:
                if self.parent_resource.delete_tag(key):
                    log_msg = f"Successfully deleted tag {key} in cloud"
                    self.parent_resource._changes.add("tags")
                    self.parent_resource.log(log_msg)
                    log.info((f"{log_msg} for {self.parent_resource.kind}"
                              f" {self.parent_resource.id}"))
                    return super().__delitem__(key)
                else:
                    log_msg = f"Error deleting tag {key} in cloud"
                    self.parent_resource.log(log_msg)
                    log.error((f"{log_msg} for {self.parent_resource.kind}"
                               f" {self.parent_resource.id}"))
            except Exception as e:
                log_msg = (f"Unhandled exception while trying to delete tag {key}"
                           f" in cloud: {type(e)} {e}")
                self.parent_resource.log(log_msg, exception=e)
                if self.parent_resource._raise_tags_exceptions:
                    raise
                else:
                    log.exception(log_msg)
        else:
            return super().__delitem__(key)
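
The same pattern boiled down: a dict subclass that removes a key locally only after an external delete succeeds. CloudTags and its injected delete callback are hypothetical, with the logging and metrics from above trimmed away:

class CloudTags(dict):
    """Dict whose keys are only removed locally after an external delete succeeds."""

    def __init__(self, delete_in_cloud, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._delete_in_cloud = delete_in_cloud  # callable(key) -> bool

    def __delitem__(self, key):
        if self._delete_in_cloud(key):
            super().__delitem__(key)
        else:
            raise RuntimeError(f"Could not delete tag {key} in cloud")

tags = CloudTags(lambda key: True, owner="alice")
del tags["owner"]  # the cloud delete succeeds, so the local key is removed
assert "owner" not in tags
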
Example 5
def collect_account(
    account: AWSAccount,
    regions: List,
    args: Optional[Namespace] = None,
    running_config: Optional[RunningConfig] = None,
) -> Graph:
    collector_name = f"aws_{account.id}"
    resotolib.proc.set_thread_name(collector_name)

    if args is not None:
        ArgumentParser.args = args
        setup_logger("resotoworker-aws")
    if running_config is not None:
        Config.running_config.apply(running_config)

    log.debug(f"Starting new collect process for account {account.dname}")

    aac = AWSAccountCollector(regions, account)
    try:
        aac.collect()
    except botocore.exceptions.ClientError as e:
        log.exception(f"An AWS {e.response['Error']['Code']} error occurred while collecting account {account.dname}")
        metrics_unhandled_account_exceptions.labels(account=account.dname).inc()
    except Exception:
        log.exception(f"An unhandled error occurred while collecting AWS account {account.dname}")
        metrics_unhandled_account_exceptions.labels(account=account.dname).inc()

    return aac.graph
Example 6
def core_tag_tasks_processor(message: dict) -> dict:
    task_id = message.get("task_id")
    # task_name = message.get("task_name")
    # task_attrs = message.get("attrs", {})
    task_data = message.get("data", {})
    delete_tags = task_data.get("delete", [])
    update_tags = task_data.get("update", {})
    node_data = task_data.get("node")
    result = "done"
    extra_data = {}

    try:
        node = node_from_dict(node_data, include_select_ancestors=True)
        for delete_tag in delete_tags:
            del node.tags[delete_tag]

        for k, v in update_tags.items():
            node.tags[k] = v

        node_dict = node_to_dict(node)
        extra_data.update({"data": node_dict})
    except Exception as e:
        log.exception("Error while updating tags")
        result = "error"
        extra_data["error"] = repr(e)

    reply_message = {
        "task_id": task_id,
        "result": result,
    }
    reply_message.update(extra_data)
    return reply_message
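
For reference, a task message of the shape this processor expects, together with the two reply shapes it can produce; the node payload is abbreviated, since its layout comes from node_to_dict/node_from_dict, which are not shown:

task_message = {
    "task_id": "t-42",
    "data": {
        "delete": ["stale-tag"],
        "update": {"owner": "alice"},
        "node": {},  # serialized node as produced by node_to_dict (abbreviated)
    },
}
# Success reply: {"task_id": "t-42", "result": "done", "data": <updated node dict>}
# Error reply:   {"task_id": "t-42", "result": "error", "error": "<repr(exception)>"}
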
Example 7
    def run(self) -> None:
        try:
            self.go()
        except Exception:
            metrics_unhandled_plugin_exceptions.labels(plugin=self.name).inc()
            log.exception(f"Caught unhandled plugin exception in {self.name}")
        else:
            self.finished = True
Example 8
def dispatch_event(event: Event, blocking: bool = False) -> None:
    """Dispatch an Event"""
    waiting_str = "" if blocking else "not "
    log.debug(
        f"Dispatching event {event.event_type.name} and {waiting_str}waiting for"
        " listeners to return")

    if event.event_type not in _events:
        return

    with _events_lock.read_access:
        # Event listeners might unregister themselves during event dispatch
        # so we will work on a shallow copy while processing the current event.
        listeners = dict(_events[event.event_type])

    threads = {}
    for listener, listener_data in listeners.items():
        try:
            if listener_data["pid"] != os.getpid():
                continue

            if listener_data["one-shot"] and not listener_data["lock"].acquire(
                    blocking=False):
                log.error(f"Not calling one-shot listener {listener} of type"
                          f" {type(listener)} - can't acquire lock")
                continue

            log.debug(f"Calling listener {listener} of type {type(listener)}"
                      f" (blocking: {listener_data['blocking']})")
            thread_name = f"{event.event_type.name.lower()}_event" f"-{getattr(listener, '__name__', 'anonymous')}"
            t = Thread(target=listener, args=[event], name=thread_name)
            if blocking or listener_data["blocking"]:
                threads[t] = listener
            t.start()
        except Exception:
            log.exception("Caught unhandled event callback exception")
        finally:
            if listener_data["one-shot"]:
                log.debug(
                    f"One-shot specified for event {event.event_type.name} "
                    f"listener {listener} - removing event listener")
                remove_event_listener(event.event_type, listener)
                listener_data["lock"].release()

    start_time = time.time()
    for thread, listener in threads.items():
        timeout = start_time + listeners[listener]["timeout"] - time.time()
        if timeout < 1:
            timeout = 1
        log.debug(
            f"Waiting up to {timeout:.2f}s for event listener {thread.name} to finish"
        )
        thread.join(timeout)
        log.debug(
            f"Event listener {thread.name} finished (timed out: {thread.is_alive()})"
        )
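
A sketch of how this dispatcher is typically driven, assuming the helpers live in resotolib.event and that the Event constructor takes the event type plus a data payload, as the other examples here suggest:

from resotolib.event import (Event, EventType, add_event_listener,
                             dispatch_event)

def on_shutdown(event: Event) -> None:
    # Runs in its own thread; dispatch_event() joins it because the
    # listener was registered as blocking.
    print(f"shutting down: {event.data}")

add_event_listener(EventType.SHUTDOWN, on_shutdown, blocking=True)
dispatch_event(Event(EventType.SHUTDOWN, {"reason": "maintenance"}))
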
Example 9
    def on_config_event(self, message: Dict[str, Any]) -> None:
        if (message.get("message_type") == "config-updated"
                and message.get("data", {}).get("id") == self.config_name
                and message.get("data", {}).get("revision")
                != Config.running_config.revision):
            try:
                log.debug(f"Config {self.config_name} has changed - reloading")
                self.load_config(reload=True)
            except Exception:
                log.exception("Failed to reload config")
Example 10
    def pre_cleanup(self, graph=None) -> bool:
        if not hasattr(self, "pre_delete"):
            return True

        if graph is None:
            graph = self._graph

        if self.phantom:
            raise RuntimeError(
                f"Can't cleanup phantom resource {self.rtdname}")

        if self.cleaned:
            log.debug(f"Resource {self.rtdname} has already been cleaned up")
            return True

        account = self.account(graph)
        region = self.region(graph)
        if not isinstance(account, BaseAccount) or not isinstance(
                region, BaseRegion):
            log.error(
                ("Could not determine account or region for pre cleanup of"
                 f" {self.rtdname}"))
            return False

        log_suffix = f" in account {account.dname} region {region.name}"
        self.log("Trying to run pre clean up")
        log.debug(f"Trying to run pre clean up {self.rtdname}{log_suffix}")
        try:
            if not getattr(self, "pre_delete")(graph):
                self.log("Failed to run pre clean up")
                log.error(
                    f"Failed to run pre clean up {self.rtdname}{log_suffix}")
                return False
            self.log("Successfully ran pre clean up")
            log.info(
                f"Successfully ran pre clean up {self.rtdname}{log_suffix}")
        except Exception as e:
            self.log("An error occurred during pre clean up", exception=e)
            log.exception(
                f"An error occurred during pre clean up {self.rtdname}{log_suffix}"
            )
            cloud = self.cloud(graph)
            metrics_resource_pre_cleanup_exceptions.labels(
                cloud=cloud.name,
                account=account.dname,
                region=region.name,
                kind=self.kind,
            ).inc()
            return False
        return True
Example 11
    def on_message(self, ws, message):
        try:
            message: Dict = json.loads(message)
        except json.JSONDecodeError:
            log.exception(f"Unable to decode received message {message}")
            return
        log.debug(f"Received event: {message}")
        if self.message_processor is not None and callable(
                self.message_processor):
            try:
                self.message_processor(message)
            except Exception:
                log.exception(
                    f"Something went wrong while processing {message}")
Example 12
    def worker(self) -> None:
        while not self.shutdown_event.is_set():
            message = self.queue.get()
            log.debug(f"{self.identifier} received: {message}")
            if self.message_processor is not None and callable(
                    self.message_processor):
                try:
                    result = self.message_processor(message)
                    log.debug(f"Sending reply {result}")
                    self.ws.send(json.dumps(result))
                except Exception:
                    log.exception(
                        f"Something went wrong while processing {message}")
            self.queue.task_done()
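
One caveat: queue.get() blocks indefinitely, so a set shutdown_event is only noticed once another message arrives. A self-contained variant that polls with a timeout instead (all names below are local stand-ins):

import queue
import threading

shutdown_event = threading.Event()
work_queue: "queue.Queue[dict]" = queue.Queue()

def worker() -> None:
    while not shutdown_event.is_set():
        try:
            message = work_queue.get(timeout=1)
        except queue.Empty:
            continue  # no message yet - re-check the shutdown flag
        try:
            print(f"processing {message}")
        finally:
            work_queue.task_done()

t = threading.Thread(target=worker, daemon=True)
t.start()
work_queue.put({"kind": "action", "message_type": "collect"})
work_queue.join()  # block until the message has been processed
shutdown_event.set()
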
Example 13
    def collect_team(self, client: StreamingWrapper) -> Optional[Graph]:
        """Collects an individual team."""
        team_id = client.get_team_id()
        team = DigitalOceanTeam(id=team_id, tags={}, urn=f"do:team:{team_id}")

        try:
            dopc = DigitalOceanTeamCollector(team, client)
            dopc.collect()
        except Exception:
            log.exception(
                f"An unhandled error occurred while collecting team {team_id}")
            return None
        else:
            return dopc.graph
Example 14
    def run(self) -> None:
        try:
            # ArgumentParser.args = self._args
            # resotolib.config._config = self._config
            # setup_logger("resotoworker")
            # resotolib.proc.initializer()
            current_thread().name = self.name
            if self.bootstrap():
                self.go()
        except Exception:
            metrics_unhandled_plugin_exceptions.labels(plugin=self.name).inc()
            log.exception(f"Caught unhandled plugin exception in {self.name}")
        else:
            self.finished = True
Example 15
    def clean(self, node: BaseResource) -> None:
        log_prefix = f"Resource {node.rtdname} is marked for removal"
        if Config.resotoworker.cleanup_dry_run:
            log.info(
                f"{log_prefix}, not calling cleanup method because of dry run flag"
            )
            return

        log.info(f"{log_prefix}, calling cleanup method")
        try:
            node.cleanup(self.graph)
        except Exception:
            log.exception(
                f"An exception occurred when running resource cleanup on {node.rtdname}"
            )
Example 16
    def pre_clean(self, node: BaseResource) -> None:
        if not hasattr(node, "pre_delete"):
            return

        log_prefix = f"Resource {node.rtdname} is marked for removal"
        if Config.resotoworker.cleanup_dry_run:
            log.info(
                f"{log_prefix}, not calling pre cleanup method because of dry run flag"
            )
            return

        log.info(f"{log_prefix}, calling pre cleanup method")
        try:
            node.pre_cleanup(self.graph)
        except Exception:
            log.exception(
                ("An exception occurred when running resource pre cleanup on"
                 f" {node.rtdname}"))
Example 17
def core_actions_processor(
    plugin_loader: PluginLoader, tls_data: TLSData, collector: Collector, message: Dict
) -> Optional[Dict]:
    collectors: List[BaseCollectorPlugin] = plugin_loader.plugins(PluginType.COLLECTOR)
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    task_id = data.get("task") if isinstance(data, dict) else None
    log.debug(f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "collect":
                start_time = time.time()
                collector.collect_and_send(collectors, task_id=task_id)
                run_time = int(time.time() - start_time)
                log.info(f"Collect ran for {run_time} seconds")
            elif message_type == "cleanup":
                if not Config.resotoworker.cleanup:
                    log.info("Cleanup called but disabled in config" " (resotoworker.cleanup) - skipping")
                else:
                    if Config.resotoworker.cleanup_dry_run:
                        log.info("Cleanup called with dry run configured" " (resotoworker.cleanup_dry_run)")
                    start_time = time.time()
                    cleanup(tls_data=tls_data)
                    run_time = int(time.time() - start_time)
                    log.info(f"Cleanup ran for {run_time} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message
Example 18
    def collect_project(
        project_id: str,
        args: Optional[Namespace] = None,
        running_config: Optional[RunningConfig] = None,
        credentials=None,
    ) -> Optional[Graph]:
        """Collects an individual project.

        Called from collect() and run either within a thread or in a spawned
        process, depending on whether `gcp.fork_process` was specified.

        Because the spawned process does not inherit any of our memory or file
        descriptors, the already parsed `args` Namespace() is passed to this
        method.
        """
        project = GCPProject(project_id, {})
        collector_name = f"gcp_{project.id}"
        resotolib.proc.set_thread_name(collector_name)

        if args is not None:
            ArgumentParser.args = args
            setup_logger("resotoworker-gcp")
        if running_config is not None:
            Config.running_config.apply(running_config)

        if credentials is not None:
            Credentials._credentials = credentials
            Credentials._initialized = True

        log.debug(f"Starting new collect process for project {project.dname}")

        try:
            gpc = GCPProjectCollector(project)
            gpc.collect()
        except Exception:
            log.exception(
                f"An unhandled error occurred while collecting {project.rtdname}"
            )
        else:
            return gpc.graph
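
A sketch of the thread-versus-process dispatch the docstring describes; the run_collect wrapper and its fork_process flag are assumptions modeled on the `gcp.fork_process` option, not the plugin's actual wiring:

import multiprocessing
import threading

def collect(project_id: str, args=None, running_config=None) -> None:
    print(f"collecting {project_id}")

def run_collect(project_id: str, fork_process: bool, args=None, running_config=None) -> None:
    if fork_process:
        # A spawned process inherits neither memory nor file descriptors,
        # which is why args and running_config are passed in explicitly.
        runner = multiprocessing.Process(
            target=collect, args=(project_id, args, running_config)
        )
    else:
        runner = threading.Thread(
            target=collect, args=(project_id, args, running_config)
        )
    runner.start()
    runner.join()

if __name__ == "__main__":
    run_collect("my-project", fork_process=True)
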
Example 19
def restart() -> None:
    python_args = []
    if not getattr(sys, "frozen", False):
        python_args = subprocess._args_from_interpreter_flags()
    args = python_args + sys.argv

    path_prefix = "." + os.pathsep
    python_path = os.environ.get("PYTHONPATH", "")
    if sys.path[0] == "" and not python_path.startswith(path_prefix):
        os.environ["PYTHONPATH"] = path_prefix + python_path

    try:
        close_fds()
    except Exception:
        log.exception("Failed to FD_CLOEXEC all file descriptors")

    kill_children(SIGTERM, ensure_death=True)

    os.chdir(initial_dir)
    os.execv(sys.executable, [sys.executable] + args)
    log.fatal("Failed to restart - exiting")
    os._exit(1)
Example 20
def main() -> None:
    setup_logger("resotoworker")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    resotolib.proc.parent_pid = os.getpid()

    arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
    )
    add_args(arg_parser)
    jwt_add_args(arg_parser)
    logging_add_args(arg_parser)
    core_add_args(arg_parser)
    Config.add_args(arg_parser)
    TLSData.add_args(arg_parser)

    # Find resoto Plugins in the resoto.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    try:
        wait_for_resotocore(resotocore.http_uri)
    except TimeoutError as e:
        log.fatal(f"Failed to connect to resotocore: {e}")
        sys.exit(1)

    tls_data = None
    if resotocore.is_secure:
        tls_data = TLSData(
            common_name=ArgumentParser.args.subscriber_id,
            resotocore_uri=resotocore.http_uri,
        )
        tls_data.start()
    config = Config(
        ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        tls_data=tls_data,
    )
    add_config(config)
    plugin_loader.add_plugin_config(config)
    config.load_config()

    def send_request(request: requests.Request) -> requests.Response:
        prepared = request.prepare()
        s = requests.Session()
        verify = None
        if tls_data:
            verify = tls_data.verify
        return s.send(request=prepared, verify=verify)

    core = Resotocore(send_request, config)

    collector = Collector(core.send_to_resotocore, config)

    # Handle Ctrl+c and other means of termination/shutdown
    resotolib.proc.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    web_server_args = {}
    if tls_data:
        web_server_args = {
            "ssl_cert": tls_data.cert_path,
            "ssl_key": tls_data.key_path,
        }
    web_server = WebServer(
        WebApp(mountpoint=Config.resotoworker.web_path),
        web_host=Config.resotoworker.web_host,
        web_port=Config.resotoworker.web_port,
        **web_server_args,
    )
    web_server.daemon = True
    web_server.start()

    core_actions = CoreActions(
        identifier=f"{ArgumentParser.args.subscriber_id}-collector",
        resotocore_uri=resotocore.http_uri,
        resotocore_ws_uri=resotocore.ws_uri,
        actions={
            "collect": {
                "timeout": Config.resotoworker.timeout,
                "wait_for_completion": True,
            },
            "cleanup": {
                "timeout": Config.resotoworker.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=partial(core_actions_processor, plugin_loader, tls_data, collector),
        tls_data=tls_data,
    )

    task_queue_filter = {}
    if len(Config.resotoworker.collector) > 0:
        task_queue_filter = {"cloud": list(Config.resotoworker.collector)}
    core_tasks = CoreTasks(
        identifier=f"{ArgumentParser.args.subscriber_id}-tagger",
        resotocore_ws_uri=resotocore.ws_uri,
        tasks=["tag"],
        task_queue_filter=task_queue_filter,
        message_processor=core_tag_tasks_processor,
        tls_data=tls_data,
    )
    core_actions.start()
    core_tasks.start()

    for Plugin in plugin_loader.plugins(PluginType.ACTION):
        try:
            log.debug(f"Starting action plugin {Plugin}")
            plugin = Plugin(tls_data=tls_data)
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    # We wait for the shutdown Event to be set() and then end the program
    shutdown_event.wait()
    web_server.shutdown()
    time.sleep(1)  # everything gets 1000ms to shutdown gracefully before we force it
    resotolib.proc.kill_children(resotolib.proc.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    os._exit(0)
Example 21
    def override_config(running_config: RunningConfig) -> None:
        if getattr(ArgumentParser.args, "config_override", None) is None:
            return
        for override in getattr(ArgumentParser.args, "config_override", []):
            try:
                if "=" not in override:
                    log.error(f"Invalid config override {override}")
                    continue
                config_key, config_value = override.split("=", 1)
                if "." not in config_key:
                    log.error(f"Invalid config override {config_key}")
                    continue

                config_keys = config_key.split(".")
                num_keys = len(config_keys)
                config_part = running_config.data
                set_value = False

                # By default we cast the override value to the type of the current
                # value. This works for most cases including dictionary values.
                # Should the current value be None we see if there was a type specified
                # for the dataclass field and use it as a fallback.
                # This only works for dataclass fields.
                config_section = config_keys[0]
                top_config_key = config_keys[1]
                fallback_target_type = None
                if (config_section in Config.running_config.types
                        and top_config_key
                        in Config.running_config.types[config_section]):
                    fallback_target_type = Config.running_config.types[
                        config_section][top_config_key]

                for num_key, key in enumerate(config_keys):
                    if num_key == num_keys - 1:
                        set_value = True
                        log.debug(f"Overriding config key {config_key}")

                    if hasattr(config_part, key):
                        attr_value = getattr(config_part, key)
                        if set_value:
                            config_value = Config.cast_target_type(
                                config_value, attr_value, fallback_target_type)
                            setattr(config_part, key, config_value)
                        else:
                            config_part = attr_value
                    elif isinstance(config_part, dict) and key in config_part:
                        attr_value = config_part[key]
                        if set_value:
                            config_value = Config.cast_target_type(
                                config_value, attr_value, fallback_target_type)
                            config_part[key] = config_value
                        else:
                            config_part = attr_value
                    else:
                        log.error(
                            f"Override key {config_key} is unknown - skipping")
                        break


            except Exception:
                log.exception(f"Failed to override config {override}")