class SppMon:
    """Main-File for the sppmon. Only general functions here and calls for sub-modules.

    Attributes:
        job_log_retention_time - Configured spp log retention time.
        minimum_timeout - increased timeout on loaded systems.
        minLogs_page_size - reduced pagesize on loaded systems.
        minLogs_joblog_type - reduced types to be requested on loaded systems.
        log_path - path to logger, set in set_logger.

    Methods:
        set_logger - Sets global logger for stdout and file logging.
        set_critial_configs - Sets up any critical infrastructure.
        set_optional_configs - Sets up any optional infrastructure.
        store_script_metrics - Stores script metrics into influxdb.
        exit - Executes finishing tasks and exits sppmon.

    """

    # set class variables
    MethodUtils.verbose = OPTIONS.verbose
    SppUtils.verbose = OPTIONS.verbose

    # ## API-REST Page settings ## #
    timeout_reduction = 0.9
    """How much % the pagesize is reduced after a timeout"""
    allowed_time_diff_quota = 0.1
    """% allowed to differ before adjustments are made"""
    maximum_increase_pagesize = 3.5
    """maximum factor for the pagesize to be increased in one go"""
    page_size = 50
    """ the starting page size, adjusted later on within rest_client"""
    min_page_size = 1
    """minimum size of a rest-api page"""
    send_retries = 3
    """How much retries are made before failing, last one is on min-size"""

    # minimum settings
    loaded_preferred_time = 40
    """perfect query send time in seconds for loaded systems"""
    minimum_timeout = 90
    """increased timeout on loaded systems."""
    minLogs_joblog_type = '["SUMMARY"]'
    """reduced types to be requested on loaded systems."""

    # default settings
    default_joblog_type = '["INFO","DEBUG","ERROR","SUMMARY","WARN"]'
    """regular joblog query types on normal running systems"""
    default_timeout = 60
    """regular timeout on normal running systems"""
    preferred_time = 30
    """perfect query send time in seconds"""

    # set later in each method, here to avoid missing attribute
    influx_client = None
    rest_client = None
    api_queries = None
    system_methods = None
    job_methods = None
    hypervisor_methods = None
    ssh_methods = None

    def __init__(self):
        self.log_path: str = ""
        """path to logger, set in set_logger."""
        self.pid_file_path: str = ""
        """path to pid_file, set in check_pid_file."""

        # String, because of days etc.
        self.job_log_retention_time = "60d"
        """Configured spp log retention time, logs get deleted after this time."""

        self.set_logger()

        if (not self.check_pid_file()):
            ExceptionUtils.error_message(
                "Another instance of sppmon with the same args is running")
            self.exit(ERROR_CODE_CMD_LINE)

        # everything is an option, otherwise it's a typo.
        if (len(ARGS) > 0):
            ExceptionUtils.error_message(
                f"CAREFUL: ARG DETECTED, probably a typo in the program call: {ARGS}"
            )

        time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec()
        self.start_counter = time.perf_counter()
        LOGGER.debug("\n\n")
        LOGGER.debug(f"running script version: {VERSION}")
        LOGGER.debug(f"cmdline options: {OPTIONS}")
        LOGGER.debug(f"{time_stamp_name}: {time_stamp}")
        LOGGER.debug("")

        if (not OPTIONS.confFileJSON):
            ExceptionUtils.error_message("missing config file, aborting")
            self.exit(error_code=ERROR_CODE_CMD_LINE)
        try:
            config_file = SppUtils.read_conf_file(
                config_file_path=OPTIONS.confFileJSON)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Syntax Error in Config file, unable to read")
            self.exit(error_code=ERROR_CODE_CMD_LINE)

        self.setup_args()
        self.set_critial_configs(config_file)
        self.set_optional_configs(config_file)

    def set_logger(self) -> None:
        """Sets global logger for stdout and file logging.

        Changes logger acquired by LOGGER_NAME.

        Raises:
            ValueError: Unable to open logger

        """
        self.log_path = SppUtils.filename_of_config(OPTIONS.confFileJSON,
                                                    ".log")

        try:
            file_handler = logging.FileHandler(self.log_path)
        except Exception as error:
            # TODO here: Right exception, how to print this error?
            print("unable to open logger")
            raise ValueError("Unable to open Logger") from error

        file_handler_fmt = logging.Formatter(
            '%(asctime)s:[PID %(process)d]:%(levelname)s:%(module)s.%(funcName)s> %(message)s'
        )
        file_handler.setFormatter(file_handler_fmt)
        if (OPTIONS.debug):
            file_handler.setLevel(logging.DEBUG)
        else:
            file_handler.setLevel(logging.ERROR)

        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)

        logger = logging.getLogger(LOGGER_NAME)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)

    def check_pid_file(self) -> bool:
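        """Checks whether another sppmon instance with the same arguments is already running.

        Reads the pid file, drops entries of processes which are no longer alive and
        appends the own pid afterwards.

        Returns:
            bool -- False if another instance with identical args is still running, True otherwise.
        """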

        self.pid_file_path = SppUtils.filename_of_config(
            OPTIONS.confFileJSON, ".pid_file")
        try:
            try:
                file = open(self.pid_file_path, "rt")
                match_list = re.findall(r"(\d+) " + str(OPTIONS), file.read())
                file.close()
                deleted_processes: List[str] = []
                for match in match_list:
                    # add spaces to make clear the whole number is matched
                    match = f' {match} '
                    try:
                        if (os.name == 'nt'):
                            args = ['ps', '-W']
                        else:
                            args = ['ps', '-p', match]
                        result = subprocess.run(args,
                                                check=True,
                                                capture_output=True)
                        if (re.search(match, str(result.stdout))):
                            return False
                        # not in there -> delete entry
                        deleted_processes.append(match)
                    except CalledProcessError as error:
                        deleted_processes.append(match)

                # delete processes which did get killed, not often called
                if (deleted_processes):
                    file = open(self.pid_file_path, "rt")
                    file_str = file.read()
                    file.close()
                    options = str(OPTIONS)
                    for pid in deleted_processes:
                        file_str = file_str.replace(f"{pid} {options}", "")
                    # do not delete if empty since we will use it below
                    file = open(self.pid_file_path, "wt")
                    file.write(file_str.strip())
                    file.close()

            except FileNotFoundError:
                pass  # no file created yet

            # always write your own pid into it
            file = open(self.pid_file_path, "at")
            file.write(f"{os.getpid()} {str(OPTIONS)}")
            file.close()
            return True
        except Exception as error:
            ExceptionUtils.exception_info(error)
            raise ValueError("Error when checking pid file")

    def remove_pid_file(self) -> None:
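        """Removes the own pid entry from the pid file, deleting the file if it becomes empty."""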
        try:
            file = open(self.pid_file_path, "rt")
            file_str = file.read()
            file.close()
            new_file_str = file_str.replace(f"{os.getpid()} {str(OPTIONS)}",
                                            "").strip()
            if (not new_file_str.strip()):
                os.remove(self.pid_file_path)
            else:
                file = open(self.pid_file_path, "wt")
                file.write(new_file_str)
                file.close()
        except Exception as error:
            ExceptionUtils.exception_info(error,
                                          "Error when removing pid_file")

    def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any critical infrastructure, to be called within the init.

        Be aware not everything may be initialized at call time.
        Add config here if the system should abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """
        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting")
            self.exit(error_code=ERROR_CODE_CMD_LINE)
        try:
            # critical components only

            auth_influx = SppUtils.get_cfg_params(param_dict=config_file,
                                                  param_name="influxDB")
            if (not isinstance(auth_influx, dict)):
                raise ValueError("influx config need to be dict")
            self.influx_client = InfluxClient(auth_influx=auth_influx)
            self.influx_client.connect()

        except ValueError as err:
            ExceptionUtils.exception_info(
                error=err,
                extra_message="error while setting up critical config. Aborting"
            )
            self.influx_client = None  # set to None because it does not work.
            self.exit(error_code=ERROR_CODE)

    def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any optional infrastructure, to be called within the init.

        Be aware not everything may be initialized at call time.
        Add config here if the system should not abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """

        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting.")
            self.exit(error_code=ERROR_CODE_CMD_LINE)

        # ############################ REST-API #####################################
        try:
            auth_rest = SppUtils.get_cfg_params(param_dict=config_file,
                                                param_name="sppServer")

            if (not isinstance(auth_rest, dict)):
                raise ValueError("sppServer config need to be dict")

            self.job_log_retention_time = auth_rest.get(
                "jobLog_rentation", "60d")

            ConnectionUtils.verbose = OPTIONS.verbose
            ConnectionUtils.timeout_reduction = self.timeout_reduction
            ConnectionUtils.allowed_time_diff_quota = self.allowed_time_diff_quota
            ConnectionUtils.maximum_increase_pagesize = self.maximum_increase_pagesize

            if (OPTIONS.minimumLogs):
                rest_time_out = self.minimum_timeout
                rest_preferred_time = self.loaded_preferred_time
            else:
                rest_time_out = self.default_timeout
                rest_preferred_time = self.preferred_time

            self.rest_client = RestClient(auth_rest, rest_time_out,
                                          rest_preferred_time, self.page_size,
                                          self.min_page_size,
                                          self.send_retries, OPTIONS.verbose)

            self.api_queries = ApiQueries(self.rest_client)
            self.rest_client.login()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="REST-API is not available due Config error")
            self.rest_client = None
            self.api_queries = None

        # ######################## System, Job and Hypervisor Methods ##################
        try:
            # set up explicitly ahead due to dependency
            self.system_methods = SystemMethods(self.influx_client,
                                                self.api_queries,
                                                OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                          self.job_log_retention_time,
                                          self.minLogs_joblog_type,
                                          self.default_joblog_type,
                                          OPTIONS.verbose, OPTIONS.minimumLogs)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            # dependent on system methods
            self.hypervisor_methods = ProtectionMethods(
                self.system_methods, self.influx_client, self.api_queries,
                OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ############################### SSH #####################################
        if (self.ssh or self.process_stats):
            try:

                auth_ssh = SppUtils.get_cfg_params(param_dict=config_file,
                                                   param_name="sshclients")

                ssh_clients: List[SshClient] = []
                if (not isinstance(auth_ssh, list)):
                    raise ValueError("not a list of sshconfig given", auth_ssh)

                for client_ssh in auth_ssh:
                    try:
                        ssh_clients.append(SshClient(client_ssh))
                    except ValueError as error:
                        ExceptionUtils.exception_info(
                            error=error,
                            extra_message="Setting up one client failed, skipping it. "
                            f"Client: {client_ssh.get('name', 'ERROR WHEN GETTING NAME')}"
                        )

                # set from None to methods once finished
                self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                              ssh_clients=ssh_clients,
                                              verbose=OPTIONS.verbose)

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "SSH-Commands are not available due Config error")

    def setup_args(self) -> None:
        """This method set up all required parameters and transforms arg groups into individual args.
        """
        # ## call functions based on cmdline parameters

        # incremental setup, higher executes all below
        all_args: bool = OPTIONS.all
        daily: bool = OPTIONS.daily or all_args
        hourly: bool = OPTIONS.hourly or daily
        constant: bool = OPTIONS.constant or hourly
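        # e.g. --daily also enables every hourly and constant method,
        # while --hourly only adds the constant methods on top.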

        # ######## All Methods #################

        self.sites: bool = OPTIONS.sites or all_args

        # ######## Daily Methods ###############

        self.vms: bool = OPTIONS.vms or daily
        self.job_logs: bool = OPTIONS.jobLogs or daily
        self.sla_stats: bool = OPTIONS.slaStats or daily
        self.vm_stats: bool = OPTIONS.vmStats or daily

        # ######## Hourly Methods ##############

        self.jobs: bool = OPTIONS.jobs or hourly
        self.vadps: bool = OPTIONS.vadps or hourly
        self.storages: bool = OPTIONS.storages or hourly
        # ssh vsnap pools ?

        # ######## Constant Methods ############

        self.ssh: bool = OPTIONS.ssh or constant
        self.process_stats: bool = OPTIONS.processStats or constant
        self.cpu: bool = OPTIONS.cpu or constant
        self.spp_catalog: bool = OPTIONS.sppcatalog or constant

    def store_script_metrics(self) -> None:
        """Stores script metrics into influxb. To be called before exit.

        Does not raise any exceptions, skips if influxdb is missing.
        """
        LOGGER.info("Storing script metrics")
        try:
            if (not self.influx_client):
                raise ValueError("no influxClient set up")
            insert_dict: Dict[str, Union[str, int, float, bool]] = {}
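            # illustrative shape of the resulting measurement (values vary per run):
            # {"sppmon_version": "...", "spp_version": "...", "spp_build": "...",
            #  "duration": <ms>, <cmdline args>, "errorCount": 0, "errorMessages": "[]", <timestamp>}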

            # add version nr, api calls are needed
            insert_dict["sppmon_version"] = VERSION
            if (self.rest_client):
                try:
                    (version_nr,
                     build) = self.rest_client.get_spp_version_build()
                    insert_dict["spp_version"] = version_nr
                    insert_dict["spp_build"] = build
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message="could not query SPP version and build.")

            # end total sppmon runtime
            end_counter = time.perf_counter()
            insert_dict['duration'] = int(
                (end_counter - self.start_counter) * 1000)

            # add arguments of sppmon
            for (key, value) in vars(OPTIONS).items():
                insert_dict[key] = value

            # save occurred errors
            error_count = len(ExceptionUtils.stored_errors)
            if (error_count > 0):
                ExceptionUtils.error_message(
                    f"total of {error_count} exception/s occurred")
            insert_dict['errorCount'] = error_count
            # save list as str
            insert_dict['errorMessages'] = str(ExceptionUtils.stored_errors)

            # get end timestamp
            (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
            insert_dict[time_key] = time_val

            # save the metrics
            self.influx_client.insert_dicts_to_buffer(
                table_name="sppmon_metrics", list_with_dicts=[insert_dict])
            self.influx_client.flush_insert_buffer()
            LOGGER.info("Stored script metrics sucessfull")
            # + 1 due the "total of x exception/s occured"
            if (error_count + 1 < len(ExceptionUtils.stored_errors)):
                ExceptionUtils.error_message(
                    "A non-critical error occured while storing script metrics. \n\
                    This error can't be saved into the DB, it's only displayed within the logs."
                )
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss"
            )

    def exit(self, error_code: int = False) -> NoReturn:
        """Executes finishing tasks and exits sppmon. To be called every time.

        Executes finishing tasks and displays error messages.
        Specify an error code only if something went wrong.
        Use Error codes specified at top of module.
        Does NOT return.

        Keyword Arguments:
            error {int} -- Error code if an error occurred. (default: {False})
        """

        # error with the command line arguments
        # don't store runtime here
        if (error_code == ERROR_CODE_CMD_LINE):
            prog_args = []
            prog_args.append(sys.argv[0])
            prog_args.append("--help")
            os.execv(sys.executable, ['python'] + prog_args)
            sys.exit(ERROR_CODE_CMD_LINE)  # unreachable?

        script_end_time = SppUtils.get_actual_time_sec()
        LOGGER.debug("Script end time: %d", script_end_time)

        try:
            self.store_script_metrics()

            if (self.influx_client):
                self.influx_client.disconnect()
            if (self.rest_client):
                self.rest_client.logout()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Error occured while exiting sppmon")
            error_code = ERROR_CODE

        if (not error_code):
            LOGGER.info("\n\n!!! script completed !!!\n")

        self.remove_pid_file()

        # Both clauses are actually the same, but kept for clarity; always last since it is true for any non-zero error code
        if (error_code == ERROR_CODE or error_code):
            ExceptionUtils.error_message(
                "Error occurred while executing sppmon")

        print(
            f"check log for details: grep \"PID {os.getpid()}\" {self.log_path} > sppmon.log.{os.getpid()}"
        )
        sys.exit(error_code)

    def main(self):
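        """Runs every data collection method selected on the command line,
        flushing the influx insert buffer after each step, and exits afterwards."""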

        if (not self.influx_client):
            ExceptionUtils.error_message(
                "somehow no influx client is present even after init")
            self.exit(ERROR_CODE)

        # ##################### SYSTEM METHODS #######################
        if (self.sites and self.system_methods):
            try:
                self.system_methods.sites()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting sites, skipping them all")

        if (self.cpu and self.system_methods):
            try:
                self.system_methods.cpuram()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting cpu stats, skipping them all"
                )

        if (self.spp_catalog and self.system_methods):
            try:
                self.system_methods.sppcatalog()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting file system stats, skipping them all"
                )

        # ####################### JOB METHODS ########################
        if (self.jobs and self.job_methods):
            # store all jobs grouped by jobID
            try:
                self.job_methods.get_all_jobs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting jobs, skipping them all")

        if (self.job_logs and self.job_methods):
            # store all job logs per job session instance
            try:
                self.job_methods.job_logs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting job logs, skipping them all"
                )

        # ####################### SSH METHODS ########################
        if (self.ssh and self.ssh_methods):
            # execute ssh statements for, VSNAP, VADP, other ssh hosts
            # store all job logs per job session instance
            try:
                self.ssh_methods.ssh()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh commands, skipping them all"
                )

        if (self.process_stats and self.ssh_methods):
            # execute process stats for server
            try:
                self.ssh_methods.process_stats()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh process statistic commands, skipping them all"
                )

        # ################### HYPERVISOR METHODS #####################
        if (self.vms and self.hypervisor_methods):
            try:
                self.hypervisor_methods.store_vms()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting all VMs, skipping them all"
                )

        if (self.sla_stats and self.hypervisor_methods):
            # number of VMs per SLA and sla dumps
            try:
                self.hypervisor_methods.vms_per_sla()
                self.hypervisor_methods.sla_dumps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting and computing VMs per sla, skipping them all"
                )

        if (self.vm_stats and self.hypervisor_methods):
            # retrieve and calculate VM inventory summary
            try:
                self.hypervisor_methods.create_inventory_summary()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when creating inventory summary, skipping them all"
                )

        if (self.vadps and self.hypervisor_methods):
            try:
                self.hypervisor_methods.vadps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting vadps, skipping them all")

        if (self.storages and self.hypervisor_methods):
            try:
                self.hypervisor_methods.storages()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting storages, skipping them all"
                )

        # ###################### OTHER METHODS #######################

        if (OPTIONS.create_dashboard):
            try:
                if (not self.influx_client):
                    raise ValueError(
                        "need the influxclient to create the dashboard")
                OtherMethods.create_dashboard(
                    dashboard_folder_path=OPTIONS.dashboard_folder_path,
                    database_name=self.influx_client.database.name)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when creating dashboards")

        # ######################   DISCLAMER   #######################
        # ###################  TEMPORARY FEATURE  ####################
        # this part is deleted once all old versions of SPPMon have been migrated
        # use at own caution
        # ############################################################
        if (OPTIONS.transfer_data):
            try:
                self.influx_client.transfer_data(OPTIONS.old_database)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when transfering data storages.")

        self.exit()


# ############################ Example 2 ############################

class SppMon:
    """Main-File for the sppmon. Only general functions here and calls for sub-modules.

    Attributes:
        log_path - path to logger, set in set_logger
        pid_file_path - path to pid_file, set in check_pid_file
        config_file
        See below for full list

    Methods:
        set_logger - Sets global logger for stdout and file logging.
        set_critial_configs - Sets up any critical infrastructure.
        set_optional_configs - Sets up any optional infrastructure.
        store_script_metrics - Stores script metrics into influxdb.
        exit - Executes finishing tasks and exits sppmon.

    """

    # set class variables
    MethodUtils.verbose = ARGS.verbose
    SppUtils.verbose = ARGS.verbose

    # ###### API-REST page settings  ###### #
    # ## IMPORTANT NOTES ## #
    # please read the documentation before adjusting values.
    # if unsure contact the sppmon development team before adjusting

    # ## Recommend changes for loaded systems ##

    # Use --loadedSystem if sppmon causes big CPU spikes on your SPP-Server
    # CAUTION: using --loadedSystem causes some data to not be recorded.
    # all changes adjust settings to avoid double-running mongodb jobs.
    # Hint: make sure SPP-mongodb tables are correctly indexed.

    # Priority list for manual changes:

    # Only if unable to connect at all:
    # 1. Increase initial_connection_timeout

    # Small/Medium Spikes:

    # finetune `default` variables:
    # 1. increase timeout while decreasing preferred send time (timeout disable: None)
    # 2. increase timeout reduction (0-0.99)
    # 3. decrease scaling factor (>1)

    # Critical/Big Spikes:

    # CAUTION Reduce Recording: causes fewer JobLog-Types to be recorded
    # 1. Enable `--loadedSystem`
    # 2. finetune `loaded`-variables (see medium spikes 1-3)
    # 3. Reduce JobLog-Types (min only `SUMMARY`)

    # Other finetuning mechanics (no data-loss):
    # 1. decrease allowed_send_delta (>=0)
    # 2. decrease starting pagesize (>1)

    # Pagesize size
    starting_page_size: int = 50
    """starting page size for dynamical change within rest_client"""
    loaded_starting_page_size: int = 10
    """starting page size for dynamical change within rest_client on loaded systems"""

    min_page_size: int = 5
    """minimum size of a rest-api page"""
    loaded_min_page_size: int = 1
    """minimum size of a rest-api page on loaded systems"""

    # Increase / Decrease of pagesize
    max_scaling_factor: float = 3.5
    """max scaling factor of the pagesize increase per request"""
    loaded_max_scaling_factor: float = 2.0
    """max scaling factor of the pagesize increase per request for loaded systems"""

    timeout_reduction: float = 0.7
    """reduce of the actual pagesize on timeout in percent"""
    loaded_timeout_reduction: float = 0.95
    """reduce of the actual pagesize on timeout in percent on loaded systems"""

    allowed_send_delta: float = 0.15
    """delta of send allowed before adjustments are made to the pagesize in %"""
    loaded_allowed_send_delta: float = 0.15
    """delta of send allowed before adjustments are made to the pagesize in % on loaded systems"""

    # Send time and timeouts
    pref_send_time: int = 30
    """preferred query send time in seconds"""
    loaded_pref_send_time: int = 30
    """desired send time per query in seconds for loaded systems"""

    initial_connection_timeout: float = 6.05
    """Time spend waiting for the initial connection, slightly larger than 3 multiple"""

    request_timeout: int | None = 60
    """timeout for api-requests, none deactivates timeout"""
    loaded_request_timeout: int | None = 180
    """timeout on loaded systems, none deactivates timeout"""

    max_send_retries: int = 3
    """Count of retries before failing request. Last one is min size. 0 to disable."""
    loaded_max_send_retries: int = 1
    """Count of retries before failing request on loaded systems. Last one is min size. 0 to disable."""

    # ## REST-CLIENT-OPTIONS ##

    # Never observed debug-type
    # possible options: '["INFO","DEBUG","ERROR","SUMMARY","WARN", "DETAIL"]'
    joblog_types = ["SUMMARY"]
    """joblog query types on normal running systems"""
    full_joblog_types = ["INFO", "DEBUG", "ERROR", "SUMMARY", "WARN", "DETAIL"]
    """jobLog types to be requested on full logs."""

    # String, because of days etc.
    # ### DATALOSS if turned down ###
    job_log_retention_time = "60d"
    """Configured spp log retention time, logs get deleted after this time."""

    # set later in each method, here to avoid missing attribute
    influx_client: Optional[InfluxClient] = None
    rest_client: Optional[RestClient] = None
    api_queries: Optional[ApiQueries] = None
    system_methods: Optional[SystemMethods] = None
    job_methods: Optional[JobMethods] = None
    protection_methods: Optional[ProtectionMethods] = None
    ssh_methods: Optional[SshMethods] = None

    def __init__(self):
        self.log_path: str = ""
        """path to logger, set in set_logger."""
        self.pid_file_path: str = ""
        """path to pid_file, set in check_pid_file."""

        self.set_logger()

        LOGGER.info("Starting SPPMon")
        if (not self.check_pid_file()):
            ExceptionUtils.error_message(
                "Another instance of sppmon with the same args is running")
            self.exit(ERROR_CODE_START_ERROR)

        time_stamp_name, time_stamp = SppUtils.get_capture_timestamp_sec()
        self.start_counter = time.perf_counter()
        LOGGER.debug("\n\n")
        LOGGER.debug(f"running script version: {VERSION}")
        LOGGER.debug(f"cmdline options: {ARGS}")
        LOGGER.debug(f"{time_stamp_name}: {time_stamp}")
        LOGGER.debug("")

        if (not ARGS.configFile):
            ExceptionUtils.error_message("missing config file, aborting")
            self.exit(error_code=ERROR_CODE_CMD_ARGS)
        try:
            self.config_file = SppUtils.read_conf_file(
                config_file_path=ARGS.configFile)
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when trying to read Config file, unable to read")
            self.exit(error_code=ERROR_CODE_START_ERROR)

        LOGGER.info("Setting up configurations")
        self.setup_args()
        self.set_critial_configs(self.config_file)
        self.set_optional_configs(self.config_file)

    def set_logger(self) -> None:
        """Sets global logger for stdout and file logging.

        Changes logger acquired by LOGGER_NAME.

        Raises:
            ValueError: Unable to open logger

        """
        self.log_path = SppUtils.mk_logger_file(ARGS.configFile, ".log")

        try:
            file_handler = logging.FileHandler(self.log_path)
        except Exception as error:
            # TODO here: Right exception, how to print this error?
            print("unable to open logger", file=sys.stderr)
            raise ValueError("Unable to open Logger") from error

        file_handler_fmt = logging.Formatter(
            '%(asctime)s:[PID %(process)d]:%(levelname)s:%(module)s.%(funcName)s> %(message)s'
        )
        file_handler.setFormatter(file_handler_fmt)
        if (ARGS.debug):
            file_handler.setLevel(logging.DEBUG)
        else:
            file_handler.setLevel(logging.ERROR)

        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)

        logger = logging.getLogger(LOGGER_NAME)
        logger.setLevel(logging.DEBUG)
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)

    def check_pid_file(self) -> bool:
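        """Checks the pid file for another running sppmon instance with identical args and registers the own pid.

        Returns:
            bool -- False if another instance with the same args is still alive, True otherwise.
        """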
        if (ARGS.verbose):
            LOGGER.info("Checking for other SPPMon instances")
        self.pid_file_path = SppUtils.mk_logger_file(ARGS.configFile,
                                                     ".pid_file")
        try:
            try:
                file = open(self.pid_file_path, "rt")
                match_list = re.findall(r"(\d+) " + str(ARGS), file.read())
                file.close()
                deleted_processes: List[str] = []
                for match in match_list:
                    # add spaces to make clear the whole number is matched
                    match = f' {match} '
                    try:
                        if (os.name == 'nt'):
                            args = ['ps', '-W']
                        else:
                            args = ['ps', '-p', match]
                        result = subprocess.run(args,
                                                check=True,
                                                capture_output=True)
                        if (re.search(match, str(result.stdout))):
                            return False
                        # not in there -> delete entry
                        deleted_processes.append(match)
                    except CalledProcessError as error:
                        deleted_processes.append(match)

                # delete processes which did get killed, not often called
                if (deleted_processes):
                    file = open(self.pid_file_path, "rt")
                    file_str = file.read()
                    file.close()
                    options = str(ARGS)
                    for pid in deleted_processes:
                        file_str = file_str.replace(f"{pid} {options}", "")
                    # do not delete if empty since we will use it below
                    file = open(self.pid_file_path, "wt")
                    file.write(file_str.strip())
                    file.close()

            except FileNotFoundError:
                pass  # no file created yet

            # always write your own pid into it
            file = open(self.pid_file_path, "at")
            file.write(f"{os.getpid()} {str(ARGS)}")
            file.close()
            return True
        except Exception as error:
            ExceptionUtils.exception_info(error)
            raise ValueError("Error when checking pid file")

    def remove_pid_file(self) -> None:
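        """Removes the own pid entry from the pid file; the file is deleted if nothing else is left."""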
        try:
            file = open(self.pid_file_path, "rt")
            file_str = file.read()
            file.close()
            new_file_str = file_str.replace(f"{os.getpid()} {str(ARGS)}",
                                            "").strip()
            if (not new_file_str.strip()):
                os.remove(self.pid_file_path)
            else:
                file = open(self.pid_file_path, "wt")
                file.write(new_file_str)
                file.close()
        except Exception as error:
            ExceptionUtils.exception_info(error,
                                          "Error when removing pid_file")

    def set_critial_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any critical infrastructure, to be called within the init.

        Be aware not everything may be initialized at call time.
        Add config here if the system should abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """
        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting")
            self.exit(error_code=ERROR_CODE_START_ERROR)
        try:
            # critical components only
            self.influx_client = InfluxClient(config_file)

            if (not self.ignore_setup):
                # delay the connect into the testing phase
                self.influx_client.connect()

        except ValueError as err:
            ExceptionUtils.exception_info(
                error=err,
                extra_message="error while setting up critical config. Aborting"
            )
            self.influx_client = None  # set to None, otherwise the variable is undeclared
            self.exit(error_code=ERROR_CODE)

    def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any optional infrastructure, to be called within the init.

        Be aware not everything may be initialized at call time.
        Add config here if the system should not abort if it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """

        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting.")
            self.exit(error_code=ERROR_CODE_START_ERROR)
        if (not self.influx_client):
            ExceptionUtils.error_message(
                "Influx client is somehow missing. aborting")
            self.exit(error_code=ERROR_CODE)

        # ############################ REST-API #####################################
        try:
            ConnectionUtils.verbose = ARGS.verbose
            # ### Loaded Systems part 1/2 ### #
            if (ARGS.minimumLogs or ARGS.loadedSystem):
                # Setting pagesize scaling settings
                ConnectionUtils.timeout_reduction = self.loaded_timeout_reduction
                ConnectionUtils.allowed_send_delta = self.loaded_allowed_send_delta
                ConnectionUtils.max_scaling_factor = self.loaded_max_scaling_factor

                # Setting RestClient request settings.
                self.rest_client = RestClient(
                    config_file=config_file,
                    initial_connection_timeout=self.initial_connection_timeout,
                    pref_send_time=self.loaded_pref_send_time,
                    request_timeout=self.loaded_request_timeout,
                    max_send_retries=self.loaded_max_send_retries,
                    starting_page_size=self.loaded_starting_page_size,
                    min_page_size=self.loaded_min_page_size,
                    verbose=ARGS.verbose)
            else:
                ConnectionUtils.timeout_reduction = self.timeout_reduction
                ConnectionUtils.allowed_send_delta = self.allowed_send_delta
                ConnectionUtils.max_scaling_factor = self.max_scaling_factor

                # Setting RestClient request settings.
                self.rest_client = RestClient(
                    config_file=config_file,
                    initial_connection_timeout=self.initial_connection_timeout,
                    pref_send_time=self.pref_send_time,
                    request_timeout=self.request_timeout,
                    max_send_retries=self.max_send_retries,
                    starting_page_size=self.starting_page_size,
                    min_page_size=self.min_page_size,
                    verbose=ARGS.verbose)

            self.api_queries = ApiQueries(self.rest_client)
            if (not self.ignore_setup):
                # delay the connect into the testing phase
                self.rest_client.login()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="REST-API is not available due Config error")
            # Required to declare variable
            self.rest_client = None
            self.api_queries = None

        # ######################## System, Job and Hypervisor Methods ##################
        try:
            # set up explicitly ahead due to dependency
            self.system_methods = SystemMethods(self.influx_client,
                                                self.api_queries, ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ### Full Logs ### #
        if (ARGS.fullLogs):
            given_log_types = self.full_joblog_types
        else:
            given_log_types = self.joblog_types

        try:
            auth_rest: Dict[str, Any] = SppUtils.get_cfg_params(
                param_dict=config_file, param_name="sppServer")  # type: ignore
            # TODO DEPRECATED TO BE REMOVED IN 1.1
            self.job_log_retention_time = auth_rest.get(
                "jobLog_rentation",
                auth_rest.get("jobLog_retention", self.job_log_retention_time))
            # TODO New once 1.1 is live
            #self.job_log_retention_time = auth_rest.get("jobLog_retention", self.job_log_retention_time)

            self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                          self.job_log_retention_time,
                                          given_log_types, ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            # dependent on system methods
            self.protection_methods = ProtectionMethods(
                self.system_methods, self.influx_client, self.api_queries,
                ARGS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ############################### SSH #####################################
        if (self.ssh and not self.ignore_setup):
            try:
                # set from None to methods once finished
                self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                              config_file=config_file,
                                              verbose=ARGS.verbose)

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "SSH-Commands are not available due Config error")
                # Variable needs to be declared
                self.ssh_methods = None
        else:
            # Variable needs to be declared
            self.ssh_methods = None

    def setup_args(self) -> None:
        """This method set up all required parameters and transforms arg groups into individual args.
        """
        # ## call functions based on cmdline parameters

        # Temporary features / Deprecated

        if (ARGS.minimumLogs):
            ExceptionUtils.error_message(
                "DEPRECATED: using deprecated argument '--minumumLogs'. Use to '--loadedSystem' instead."
            )
        if (ARGS.processStats):
            ExceptionUtils.error_message(
                "DEPRECATED: using deprecated argument '--minumumLogs'. Use to '--ssh' instead."
            )

        # ignore setup args
        self.ignore_setup: bool = (ARGS.create_dashboard
                                   or bool(ARGS.dashboard_folder_path)
                                   or ARGS.test)
        if (self.ignore_setup):
            ExceptionUtils.error_message(
                "> WARNING: An option for a utility operation has been specified.  Bypassing normal SPPMON operation."
            )

        if ((ARGS.create_dashboard or bool(ARGS.dashboard_folder_path))
                and not (ARGS.create_dashboard
                         and bool(ARGS.dashboard_folder_path))):
            ExceptionUtils.error_message(
                "> Using --create_dashboard without associated folder path. Aborting."
            )
            self.exit(ERROR_CODE_CMD_ARGS)

        # incremental setup, higher executes all below
        all_args: bool = ARGS.all
        daily: bool = ARGS.daily or all_args
        hourly: bool = ARGS.hourly or daily
        constant: bool = ARGS.constant or hourly
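        # a higher group implies every lower one: --all > --daily > --hourly > --constant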

        # ######## All Methods #################

        self.sites: bool = ARGS.sites or all_args

        # ######## Daily Methods ###############

        self.vms: bool = ARGS.vms or daily
        self.job_logs: bool = ARGS.jobLogs or daily
        self.sla_stats: bool = ARGS.slaStats or daily
        self.vm_stats: bool = ARGS.vmStats or daily

        # ######## Hourly Methods ##############

        self.jobs: bool = ARGS.jobs or hourly
        self.vadps: bool = ARGS.vadps or hourly
        self.storages: bool = ARGS.storages or hourly
        # ssh vsnap pools ?

        # ######## Constant Methods ############

        self.ssh: bool = ARGS.ssh or constant
        self.cpu: bool = ARGS.cpu or constant
        self.spp_catalog: bool = ARGS.sppcatalog or constant

    def store_script_metrics(self) -> None:
        """Stores script metrics into influxb. To be called before exit.

        Does not raise any exceptions, skips if influxdb is missing.
        """
        LOGGER.info("Storing script metrics")
        try:
            if (not self.influx_client):
                raise ValueError("no influxClient set up")
            insert_dict: Dict[str, Union[str, int, float, bool]] = {}

            # add version nr, api calls are needed
            insert_dict["sppmon_version"] = VERSION
            insert_dict["influxdb_version"] = self.influx_client.version
            if (self.rest_client):
                try:
                    (version_nr,
                     build) = self.rest_client.get_spp_version_build()
                    insert_dict["spp_version"] = version_nr
                    insert_dict["spp_build"] = build
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message="could not query SPP version and build.")

            # end total sppmon runtime
            end_counter = time.perf_counter()
            insert_dict['duration'] = int(
                (end_counter - self.start_counter) * 1000)

            # add arguments of sppmon
            for (key, value) in vars(ARGS).items():
                # Value is either string, true or false/None
                if (value):
                    insert_dict[key] = value

            # save occurred errors
            error_count = len(ExceptionUtils.stored_errors)
            if (error_count > 0):
                ExceptionUtils.error_message(
                    f"total of {error_count} exception/s occurred")
            insert_dict['errorCount'] = error_count
            # save list as str if not empty
            if (ExceptionUtils.stored_errors):
                insert_dict['errorMessages'] = str(
                    ExceptionUtils.stored_errors)

            # get end timestamp
            (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
            insert_dict[time_key] = time_val

            # save the metrics
            self.influx_client.insert_dicts_to_buffer(
                table_name="sppmon_metrics", list_with_dicts=[insert_dict])
            self.influx_client.flush_insert_buffer()
            LOGGER.info("Stored script metrics sucessfull")
            # + 1 due the "total of x exception/s occured"
            if (error_count + 1 < len(ExceptionUtils.stored_errors)):
                ExceptionUtils.error_message(
                    "A non-critical error occured while storing script metrics. \n\
                    This error can't be saved into the DB, it's only displayed within the logs."
                )
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss"
            )

    def exit(self, error_code: int = SUCCESS_CODE) -> NoReturn:
        """Executes finishing tasks and exits sppmon. To be called every time.

        Executes finishing tasks and displays error messages.
        Specify an error code only if something went wrong.
        Use Error codes specified at top of module.
        Does NOT return.

        Keyword Arguments:
            error {int} -- Error code if an error occurred. (default: {0})
        """

        # error with the command line arguments
        # don't store runtime here
        if (error_code == ERROR_CODE_CMD_ARGS):
            parser.print_help()
            sys.exit(ERROR_CODE_CMD_ARGS)
        if (error_code == ERROR_CODE_START_ERROR):
            ExceptionUtils.error_message(
                "Error when starting SPPMon. Please review the errors above")
            sys.exit(ERROR_CODE_START_ERROR)

        script_end_time = SppUtils.get_actual_time_sec()
        LOGGER.debug("Script end time: %d", script_end_time)

        try:
            if (not self.ignore_setup):
                self.store_script_metrics()

                if (self.influx_client):
                    self.influx_client.disconnect()
                if (self.rest_client):
                    self.rest_client.logout()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="Error occured while exiting sppmon")
            error_code = ERROR_CODE

        self.remove_pid_file()

        # Both error-clauses are actually the same, but kept separate for a possible split between error cases
        # always last since it is true for any number != 0
        if (error_code == ERROR_CODE or error_code):
            ExceptionUtils.error_message(
                "Error occurred while executing sppmon")
        elif (not self.ignore_setup):
            LOGGER.info("\n\n!!! script completed !!!\n")

        print(
            f"check log for details: grep \"PID {os.getpid()}\" {self.log_path} > sppmon.log.{os.getpid()}"
        )
        sys.exit(error_code)

    def main(self):
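        """Executes all methods and utility operations selected via the command line,
        flushing the influx insert buffer after each collection step."""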

        LOGGER.info("Starting argument execution")

        if (not self.influx_client):
            ExceptionUtils.error_message(
                "somehow no influx client is present even after init")
            self.exit(ERROR_CODE)

        # ##################### SYSTEM METHODS #######################
        if (self.sites and self.system_methods):
            try:
                self.system_methods.sites()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting sites, skipping them all")

        if (self.cpu and self.system_methods):
            try:
                self.system_methods.cpuram()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting cpu stats, skipping them all"
                )

        if (self.spp_catalog and self.system_methods):
            try:
                self.system_methods.sppcatalog()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting file system stats, skipping them all"
                )

        # ####################### JOB METHODS ########################
        if (self.jobs and self.job_methods):
            # store all jobs grouped by jobID
            try:
                self.job_methods.get_all_jobs()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting jobs, skipping them all")

        if (self.job_logs and self.job_methods):
            # store all job logs per job session instance
            try:
                self.job_methods.job_logs()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting job logs, skipping them all"
                )

        # ####################### SSH METHODS ########################
        if (self.ssh and self.ssh_methods):
            # execute ssh statements for, VSNAP, VADP, other ssh hosts
            # store all job logs per job session instance
            try:
                self.ssh_methods.ssh()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh commands, skipping them all"
                )

        # ################### HYPERVISOR METHODS #####################
        if (self.vms and self.protection_methods):
            try:
                self.protection_methods.store_vms()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting all VMs, skipping them all"
                )

        if (self.sla_stats and self.protection_methods):
            # number of VMs per SLA and sla dumps
            try:
                self.protection_methods.vms_per_sla()
                self.protection_methods.sla_dumps()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting and computing VMs per sla, skipping them all"
                )

        if (self.vm_stats and self.protection_methods):
            # retrieve and calculate VM inventory summary
            try:
                self.protection_methods.create_inventory_summary()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when creating inventory summary, skipping them all"
                )

        if (self.vadps and self.protection_methods):
            try:
                self.protection_methods.vadps()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting vadps, skipping them all")

        if (self.storages and self.protection_methods):
            try:
                self.protection_methods.storages()
                self.influx_client.flush_insert_buffer()
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting storages, skipping them all"
                )

        # ###################### OTHER METHODS #######################

        if (ARGS.copy_database):
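            # ARGS.copy_database holds the name of the database to copy the current
            # data into (presumably supplied via a command-line option such as
            # --copy_database; the exact flag name is an assumption here).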
            try:
                self.influx_client.copy_database(ARGS.copy_database)
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when coping database.")

        # ################### NON-SETUP-METHODS #######################

        if (ARGS.test):
            try:
                TestingMethods.test_connection(self.config_file,
                                               self.influx_client,
                                               self.rest_client)
            except Exception as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when testing connection.")

        # DEPRECATED TODO REMOVE NEXT VERSION
        if (ARGS.create_dashboard):
            try:
                ExceptionUtils.error_message(
                    "This method is deprecated. You do not need to manually create a dashboard anymore.\n"
                    +
                    "Please just select the datasource when importing the regular 14-day dashboard in grafana.\n"
                    +
                    "Devs may adjust their dashboard to be generic with the scripts/generifyDashboard.py script."
                )
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when creating dashboard")

        self.exit()
    def test_connection(config_file: Dict[str, Any], influx_client: InfluxClient,
                        rest_client: RestClient):
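        """Tests all connections required for SPPMon to work: InfluxDB, the SPP REST-API and the configured SSH clients.

        Logs a conclusion stating whether SPPMon is fully usable, usable with
        warnings, or not usable at all.

        Args:
            config_file (Dict[str, Any]): the parsed sppmon config file
            influx_client (InfluxClient): InfluxDB client to be tested
            rest_client (RestClient): SPP REST-API client to be tested

        Raises:
            ValueError: no config file was specified
        """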
        if (not config_file):
            raise ValueError("SPPmon does not work without a config file")

        LOGGER.info("Testing all connections required for SPPMon to work")
        working: bool = True  # SPPMon itself will finish successfully (no critical errors)
        no_warnings: bool = True  # SPPMon will finish without any warnings (no errors at all)

        # ## InfluxDB ##

        LOGGER.info("> Testing and configuring InfluxDB")
        try:
            influx_client.connect()
            influx_client.disconnect()
            if (not influx_client.use_ssl):
                ExceptionUtils.error_message(
                    "> WARNING: Mandatory SSL is disabled. We hightly recommend to enable it!"
                )
                no_warnings = False

            LOGGER.info("InfluxDB is ready for use")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=
                "> Testing of the InfluxDB failed. This is a crictial component of SPPMon."
            )
            working = False

        # ## REST-API ##

        LOGGER.info("> Testing REST-API of SPP.")
        try:
            rest_client.login()
            (version_nr, build_nr) = rest_client.get_spp_version_build()
            LOGGER.info(
                f">> Sucessfully connected to SPP V{version_nr}, build {build_nr}."
            )
            rest_client.logout()
            LOGGER.info("> REST-API is ready for use")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=
                "> Testing of the REST-API failed. This is a crictial component of SPPMon."
            )
            working = False

        # ## SSH-CLIENTS ##

        LOGGER.info(
            "> Testing all types of SSH-Clients: Server, VAPDs, vSnaps, Cloudproxy and others"
        )
        ssh_working = True  # Whether the --ssh argument would finish without any errors at all

        # Check the count and types of the configured ssh clients
        ssh_clients: List[SshClient] = SshMethods.setup_ssh_clients(
            config_file)
        if (not ssh_clients):
            ExceptionUtils.error_message(
                ">> No SSH-clients detected at all. At least the server itself should be added for process-statistics."
            )
            ssh_working = False
        else:
            for ssh_type in SshTypes:
                if (not list(
                        filter(lambda client: client.client_type == ssh_type,
                               ssh_clients))):
                    LOGGER.info(f">> No {ssh_type.name} client detected.")

                    if (ssh_type == SshTypes.SERVER):
                        ExceptionUtils.error_message(
                            ">> Critical: Without the Server as an ssh client you won't have any process statistics available. These are a key part of SPPMon."
                        )
                        ssh_working = False  # No error, but still critical

                    if (ssh_type == SshTypes.VSNAP):
                        LOGGER.info(
                            ">> WARNING: Without a vSnap as ssh client you have no access to storage information. You may add vSnaps for additional monitoring and alerts."
                        )
                        no_warnings = False  # ssh will still work, but that's definitely a warning

            ssh_methods: SshMethods = SshMethods(influx_client, config_file,
                                                 False)
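            # The per-client-type command lists defined by SshMethods are reused below
            # to verify that every required command works on each registered client.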
            # Connection check
            LOGGER.info(
                f">> Testing now connection and commands of {len(ssh_clients)} registered ssh-clients."
            )
            for client in ssh_clients:
                try:
                    client.connect()
                    client.disconnect()

                    error_count: int = len(ExceptionUtils.stored_errors)
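                    # Any new entries in ExceptionUtils.stored_errors after executing the
                    # commands indicate commands that failed on this client.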
                    MethodUtils.ssh_execute_commands(
                        ssh_clients=[client],
                        ssh_type=client.client_type,
                        command_list=ssh_methods.client_commands[
                            client.client_type] + ssh_methods.all_command_list)
                    if (len(ExceptionUtils.stored_errors) != error_count):
                        ssh_working = False
                        ExceptionUtils.error_message(
                            f"Not all commands available for client {client.host_name} with type: {client.client_type}.\n"
                            +
                            "Please check manually if the commands are installed and their output."
                        )

                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error,
                        extra_message=
                        f"Connection failed for client {client.host_name} with type: {client.client_type}."
                    )
                    ssh_working = False

        if (ssh_working):
            LOGGER.info("> Testing of SSH-clients sucessfull.")
        else:
            LOGGER.info(
                "> Testing of SSH-clients failed! SPPMon will still work, not all informations are available."
            )
            no_warnings = False

        # #### Conclusion ####

        if (working and no_warnings):
            LOGGER.info(
                "> All components tested sucessfully. SPPMon is ready to be used!"
            )
        elif (working):
            LOGGER.info(
                "> Testing partially sucessful. SPPMon will run, but please check the warnings."
            )
        else:
            LOGGER.info(
                "> Testing failed. SPPMon is not ready to be used. Please fix the connection issues."
            )
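
        # Note: apart from the missing-config-file check above, test_connection only
        # logs its conclusion; it does not return a status or raise on partial failures.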