def job_logs(self) -> None:
        """saves all jobLogs for the jobsessions in influx catalog.

        Make sure to call `get_all_jobs` before to aquire all jobsessions.
        In order to save them it deletes and rewrites all affected jobsession entrys.
        It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
        """

        table = self.__influx_client.database['jobs']
        # only store if there is something to store -> limited by job log retention time.
        where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
        where_str += f' AND time > now() - {table.retention_policy.duration}'

        jobs_updated = 0
        logs_total_count = 0
        LOGGER.info("> getting joblogs for jobsessions without saved logs")
        LOGGER.info(">> requesting jobList from database")

        # Select all jobs without joblogs
        keyword = Keyword.SELECT
        query = SelectionQuery(
            keyword=keyword,
            tables=[table],
            fields=['*'],
            where_str=where_str
        )
        # send query and compute
        result = self.__influx_client.send_selection_query(query) # type: ignore
        result_list: List[Dict[str, Any]] = list(result.get_points()) # type: ignore

        rows_affected = len(result_list)


        LOGGER.info(">>> number of jobs with no joblogs stored in Influx database: {}"
                    .format(rows_affected))

        job_log_dict: Dict[int, List[Dict[str, Any]]] = {}

        # request all jobLogs from REST-API
        # if errors occur, skip single row and debug
        for row in result_list:
            job_session_id: Optional[int] = row.get('id', None)

            # if somehow id is missing: skip
            if(job_session_id is None):
                ExceptionUtils.error_message(f"Error: joblogId missing for row {row}")
                continue

            if(job_session_id in job_log_dict):
                ExceptionUtils.error_message(f"Error: joblogId duplicate, skipping.{job_session_id}")
                continue

            if(self.__verbose or len(job_log_dict) % 5 == 0):
                LOGGER.info(
                    f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions.")

            # request the joblogs for this job_session_id
            try:
                if(self.__verbose):
                    LOGGER.info(f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")
                LOGGER.debug(f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")

                # can't use the generic query method here due to the extra params needed
                job_log_list = self.__api_queries.get_job_log_details(
                    jobsession_id=job_session_id,
                    job_logs_type=self.__job_log_type)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
                continue

            if(self.__verbose):
                LOGGER.info(f">>> Found {len(job_log_list)} logs for jobsessionId {job_session_id}")

            LOGGER.debug(f"Found {len(job_log_list)} logs for jobsessionId {job_session_id}")
            # default to an empty list if no details are available -> should not happen, kept for safety reasons
            # if this is None, go down to the rest client and fix it. It should be an empty list.
            if(job_log_list is None):
                job_log_list = []
                ExceptionUtils.error_message(
                    "A joblog_list was none, even if the type does not allow it. Please report to developers.")
            job_log_dict[job_session_id] = job_log_list

        # list to be inserted after everything is updated
        insert_list: List[Dict[str, Any]] = []

        # Query data in ranges to avoid too many requests
        # Results from first select query above
        for row in result_list:
            job_id: int = row['id']
            job_log_list: Optional[List[Dict[str, Any]]] = job_log_dict.get(job_id, None)

            if(job_log_list is None):
                ExceptionUtils.error_message(
                    f"missing job_log_list even though it is in influxdb for jobId {job_id}. Skipping it")
                continue

            # jobLogsCount will be zero if jobLogs are deleted after X days by maintenance jobs, GUI default is 60 days
            job_logs_count = len(job_log_list)
            if(self.__verbose):
                LOGGER.info(">>> storing {} joblogs for jobsessionId: {} in Influx database".format(
                    len(job_log_list), job_id))
            LOGGER.debug(">>> storing {} joblogs for jobsessionId: {} in Influx database".format(
                len(job_log_list), job_id))

            # compute other stats out of jobList
            try:
                self.__job_logs_to_stats(job_log_list)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error, extra_message=f"Failed to compute stats out of job logs, skipping for jobsessionId {job_id}")

            for job_log in job_log_list:
                # add session info, rename key 'id' to 'jobLogId' and reformat messageParams
                job_log["jobSessionId"] = row.get("jobId", None)
                job_log["jobSessionName"] = row.get("jobName", None)
                job_log["jobLogId"] = job_log.pop("id")
                job_log["messageParams"] = json.dumps(
                    job_log["messageParams"])

            # if the list is empty (e.g. because logs were erased) the insert simply returns and does nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=job_log_list, table_name="jobLogs")


            jobs_updated += 1
            logs_total_count += job_logs_count
            # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
            update_fields = {
                "jobLogsCount": job_logs_count,
                "jobsLogsStored": True
            }
            # copy dict to allow update without errors
            mydict = dict(row.items())
            # update fields
            for(key, value) in update_fields.items():
                mydict[key] = value
            insert_list.append(mydict)

        # Delete data to allow reinsert with different tags
        delete_query = SelectionQuery(
            keyword=Keyword.DELETE,
            tables=[table],
            where_str=where_str
        )

        # now send remove query to prevent data loss
        self.__influx_client.send_selection_query(delete_query)

        # Insert data after everything is completed
        self.__influx_client.insert_dicts_to_buffer(table.name, insert_list)

        LOGGER.info(">>> inserted a total of {} logs".format(logs_total_count))
    def __query_url(self, url: str) -> Tuple[Dict[str, Any], float]:
        """Sends a request to this endpoint. Repeats if timeout error occured.

        Adust the pagesize on timeout.

        Arguments:
            url {str} -- URL to be queried.

        Raises:
            ValueError: No URL specified
            ValueError: Error when requesting endpoint
            ValueError: Wrong status code
            ValueError: failed to parse result
            ValueError: Timeout when sending result

        Returns:
            Tuple[Dict[str, Any], float] -- Result of the request with the required send time
        """
        if (not url):
            raise ValueError("no url specified")

        LOGGER.debug(f"endpoint request {url}")

        failed_trys: int = 0
        response_query: Optional[Response] = None

        while (response_query is None):

            # read pagesize
            actual_page_size = ConnectionUtils.url_get_param_value(
                url=url, param_name="pageSize")

            # Always set Pagesize to avoid different pagesizes by system
            if (not actual_page_size):
                url = ConnectionUtils.url_set_param(
                    url=url,
                    param_name="pageSize",
                    param_value=self.__page_size)
            else:
                # read the pagesize
                try:
                    actual_page_size = int(actual_page_size[0])
                except (ValueError, KeyError) as error:
                    ExceptionUtils.exception_info(
                        error, extra_message="invalid page size recorded")
                    actual_page_size = -1

            # adjust pagesize of url
            if (actual_page_size != self.__page_size):
                LOGGER.debug(
                    f"setting new pageSize from {actual_page_size} to {self.__page_size}"
                )
                url = ConnectionUtils.url_set_param(
                    url=url,
                    param_name="pageSize",
                    param_value=self.__page_size)

            # send the query
            try:
                start_time = time.perf_counter()
                response_query = requests.get(  # type: ignore
                    url=url,
                    headers=self.__headers,
                    verify=False,
                    timeout=self.__timeout)
                end_time = time.perf_counter()
                send_time = (end_time - start_time)

            except requests.exceptions.ReadTimeout as timeout_error:

                # timeout occurred, increasing failed tries
                failed_trys += 1

                # #### Aborting cases ######
                if (self.__send_retries < failed_trys):
                    ExceptionUtils.exception_info(error=timeout_error)
                    # read start index for debugging
                    start_index = ConnectionUtils.url_get_param_value(
                        url=url, param_name="pageStartIndex")
                    # report timeout with full information
                    raise ValueError(
                        "timeout after repeating a maximum ammount of times.",
                        timeout_error, failed_trys, self.__page_size,
                        start_index)

                if (self.__page_size == self.__min_page_size):
                    ExceptionUtils.exception_info(error=timeout_error)
                    # read start index for debugging
                    start_index = ConnectionUtils.url_get_param_value(
                        url=url, param_name="pageStartIndex")
                    # report timeout with full information
                    raise ValueError(
                        "timeout after using minumum pagesize. repeating the request is of no use.",
                        timeout_error, failed_trys, self.__page_size,
                        start_index)

                # #### continuing cases ######
                if (self.__send_retries == failed_trys):  # last try
                    LOGGER.debug(
                        f"Timeout error when requesting, now last try of total {self.__send_retries}. Reducing pagesize to minimum for url: {url}"
                    )
                    if (self.__verbose):
                        LOGGER.info(
                            f"Timeout error when requesting, now last try of total {self.__send_retries}. Reducing pagesize to minimum for url: {url}"
                        )

                    self.__page_size = self.__min_page_size
                    # repeat with minimal possible size

                elif (self.__send_retries >
                      failed_trys):  # more than 1 try left
                    LOGGER.debug(
                        f"Timeout error when requesting, now on try {failed_trys} of {self.__send_retries}. Reducing pagesize for url: {url}"
                    )
                    if (self.__verbose):
                        LOGGER.info(
                            f"Timeout error when requesting, now on try {failed_trys} of {self.__send_retries}. Reducing pagesize for url: {url}"
                        )
                    self.__page_size = ConnectionUtils.adjust_page_size(
                        page_size=self.__page_size,
                        min_page_size=self.__min_page_size,
                        time_out=True)
                    # repeat with reduced page size

            except requests.exceptions.RequestException as error:
                ExceptionUtils.exception_info(error=error)
                raise ValueError("error when requesting endpoint", error)

        if response_query.status_code != 200:
            raise ValueError("Wrong Status code when requesting endpoint data",
                             response_query.status_code, url, response_query)

        try:
            response_json: Dict[str, Any] = response_query.json()
        except (json.decoder.JSONDecodeError,
                ValueError) as error:  # type: ignore
            raise ValueError("failed to parse result of REST-API request",
                             response_query) from error  # type: ignore

        return (response_json, send_time)
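# Minimal, self-contained sketch of the timeout handling above. The halving step is
# an assumption for illustration; the original delegates the reduction to
# ConnectionUtils.adjust_page_size, whose exact strategy is not shown here.
def next_page_size(page_size: int, min_page_size: int,
                   failed_tries: int, max_retries: int) -> int:
    # aborting cases: retries exhausted or already at the minimum page size
    if failed_tries > max_retries or page_size == min_page_size:
        raise ValueError("timeout: repeating the request is of no use",
                         failed_tries, page_size)
    # last try: fall back to the minimum page size
    if failed_tries == max_retries:
        return min_page_size
    # otherwise reduce the page size and retry
    return max(min_page_size, page_size // 2)

print(next_page_size(page_size=100, min_page_size=5, failed_tries=1, max_retries=3))  # 50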
    def post_data(
            self,
            endpoint: Optional[str] = None,
            url: Optional[str] = None,
            post_data: Optional[str] = None,
            auth: Optional[HTTPBasicAuth] = None) -> Dict[str, Any]:
        """Queries endpoint by a POST-Request.

        Only specify `auth` if you want to log in. Either specify endpoint or url.

        Keyword Arguments:
            endpoint {str} -- Endpoint to be queried (default: {None})
            url {str} -- URL to be queried (default: {None})
            post_data {str} -- data with filters/parameters (default: {None})
            auth {HTTPBasicAuth} -- auth if you want to log in (default: {None})

        Raises:
            ValueError: no endpoint or url specified
            ValueError: both url and endpoint specified
            ValueError: no post_data or auth specified
            ValueError: error when sending post data
            ValueError: wrong status code in response
            ValueError: failed to parse query

        Returns:
            Dict[str, Any] -- JSON response body of the request
        """
        if (not endpoint and not url):
            raise ValueError("neither url nor endpoint specified")
        if (endpoint and url):
            raise ValueError("both url and endpoint specified")
        if (not post_data and not auth):
            raise ValueError("either provide auth or post_data")

        if (not url):
            url = self.__srv_url + endpoint

        LOGGER.debug(f"post_data request {url} {post_data} {auth}")

        try:
            if (post_data):
                response_query: Response = requests.post(  # type: ignore
                    url,
                    headers=self.__headers,
                    data=post_data,
                    verify=False,
                    timeout=self.__timeout)
            else:
                response_query: Response = requests.post(  # type: ignore
                    url,
                    headers=self.__headers,
                    auth=auth,
                    verify=False,
                    timeout=self.__timeout)
        except requests.exceptions.RequestException as error:  # type: ignore
            ExceptionUtils.exception_info(error=error)  # type: ignore
            raise ValueError("Error when sending REST-API post data", endpoint,
                             post_data)

        if response_query.status_code != 200:
            raise ValueError(
                "Status Code Error in REST-API post data response",
                response_query.status_code, response_query, endpoint,
                post_data)  # type: ignore

        try:
            response_json: Dict[str, Any] = response_query.json()
        except (json.decoder.JSONDecodeError,
                ValueError) as error:  # type: ignore
            raise ValueError("failed to parse query in restAPI post request",
                             response_query, endpoint,
                             post_data)  # type: ignore

        return response_json
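# Standalone sketch of the argument rules enforced by post_data above, written as a
# hypothetical helper: exactly one of endpoint/url must be given, plus at least one
# of post_data/auth. Names and the example server URL are illustrative only.
from typing import Optional

def resolve_post_target(srv_url: str, endpoint: Optional[str], url: Optional[str],
                        post_data: Optional[str], auth: Optional[object]) -> str:
    if not endpoint and not url:
        raise ValueError("neither url nor endpoint specified")
    if endpoint and url:
        raise ValueError("both url and endpoint specified")
    if not post_data and not auth:
        raise ValueError("either provide auth or post_data")
    return url or (srv_url + endpoint)

print(resolve_post_target("https://spp.example.com", "/api/endeavour/session",
                          None, None, auth=object()))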
    def main(self):

        if (not self.influx_client):
            ExceptionUtils.error_message(
                "somehow no influx client is present even after init")
            self.exit(ERROR_CODE)

        # ##################### SYSTEM METHODS #######################
        if (self.sites and self.system_methods):
            try:
                self.system_methods.sites()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting sites, skipping them all")

        if (self.cpu and self.system_methods):
            try:
                self.system_methods.cpuram()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting cpu stats, skipping them all"
                )

        if (self.spp_catalog and self.system_methods):
            try:
                self.system_methods.sppcatalog()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting file system stats, skipping them all"
                )

        # ####################### JOB METHODS ########################
        if (self.jobs and self.job_methods):
            # store all jobs grouped by jobID
            try:
                self.job_methods.get_all_jobs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting jobs, skipping them all")

        if (self.job_logs and self.job_methods):
            # store all job logs per job session instance
            try:
                self.job_methods.job_logs()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting job logs, skipping them all"
                )

        # ####################### SSH METHODS ########################
        if (self.ssh and self.ssh_methods):
            # execute ssh statements for, VSNAP, VADP, other ssh hosts
            # store all job logs per job session instance
            try:
                self.ssh_methods.ssh()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh commands, skipping them all"
                )

        if (self.process_stats and self.ssh_methods):
            # execute process stats for server
            try:
                self.ssh_methods.process_stats()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when excecuting ssh process statistic commands, skipping them all"
                )

        # ################### HYPERVISOR METHODS #####################
        if (self.vms and self.hypervisor_methods):
            try:
                self.hypervisor_methods.store_vms()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting all VMs, skipping them all"
                )

        if (self.sla_stats and self.hypervisor_methods):
            # number of VMs per SLA and sla dumps
            try:
                self.hypervisor_methods.vms_per_sla()
                self.hypervisor_methods.sla_dumps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting and computing VMs per sla, skipping them all"
                )

        if (self.vm_stats and self.hypervisor_methods):
            # retrieve and calculate VM inventory summary
            try:
                self.hypervisor_methods.create_inventory_summary()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when creating inventory summary, skipping them all"
                )

        if (self.vadps and self.hypervisor_methods):
            try:
                self.hypervisor_methods.vadps()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when requesting vadps, skipping them all")

        if (self.storages and self.hypervisor_methods):
            try:
                self.hypervisor_methods.storages()
                self.influx_client.flush_insert_buffer()
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when collecting storages, skipping them all"
                )

        # ###################### OTHER METHODS #######################

        if (OPTIONS.create_dashboard):
            try:
                OtherMethods.create_dashboard(
                    dashboard_folder_path=OPTIONS.dashboard_folder_path,
                    database_name=self.influx_client.database.name)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message="Top-level-error when creating dashboards")

        # ######################   DISCLAIMER   ######################
        # ###################  TEMPORARY FEATURE  ####################
        # this part is deleted once all old versions of SPPMon have been migrated
        # use with caution
        # ############################################################
        if (OPTIONS.transfer_data):
            try:
                self.influx_client.transfer_data(OPTIONS.old_database)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Top-level-error when transfering data storages.")

        self.exit()
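# The blocks in main() above all repeat the same pattern: run a collection method,
# flush the insert buffer, and catch ValueError so one failing collector does not
# stop the run. A compact sketch of that pattern as a hypothetical helper:
from typing import Callable

def run_and_flush(enabled: bool, method: Callable[[], None],
                  flush: Callable[[], None], description: str) -> None:
    if not enabled:
        return
    try:
        method()
        flush()
    except ValueError as error:
        print(f"Top-level-error when {description}, skipping them all: {error}")

# example with dummy callables
run_and_flush(True, lambda: None, lambda: None, "requesting sites")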
Example #5
    def query_url(
        self,
        url: str,
        params: Dict[str, Any] = None,
        request_type: RequestType = RequestType.GET,
        post_data: Dict[str, str] = None,
        auth: HTTPBasicAuth = None) -> Tuple[Dict[str, Any], float]:
        """Sends a request to this endpoint. Repeats if timeout error occured. Adust the pagesize on timeout.

        Arguments:
            url {str} -- URL to be queried. Must contain the server-uri and Endpoint. Does not allow encoded parameters
            post_data {str} -- additional data with filters/parameters. Only to be send with a POST-Request (default: {None})
            auth {HTTPBasicAuth} -- Basic auth to be used to login into SPP via POST-Request(default: {None})
            type {RequestType} -- What kind of Request should be made, defaults to GET

        Raises:
            ValueError: No URL specified
            ValueError: Error when requesting endpoint
            ValueError: Wrong status code
            ValueError: failed to parse result
            ValueError: Timeout when sending result
            ValueError: No post-data/auth is allowed in a GET-Request

        Returns:
            Tuple[Dict[str, Any], float] -- Result of the request with the required send time
        """
        if(not url):
            raise ValueError("no url specified")
        if((post_data or auth) and request_type == RequestType.GET):
            raise ValueError("No post-data/auth is allowed in a GET-Request")
        LOGGER.debug(f"query url: {url}, type: {type}, post_data: {post_data} auth: {True if auth else False}")
        if(not params):
            params = {}

        failed_tries: int = 0
        response_query: Optional[Response] = None
        send_time: float = -1 # prevent unbound var

        # avoid unset pageSize to not get into SPP defaults
        if("pageSize" not in params):
            LOGGER.debug(f"setting pageSize to {self.__page_size} from unset value")
            params["pageSize"] = self.__page_size
        elif(params["pageSize"] is None):
            params.pop("pageSize")

        while(response_query is None):

            # send the query
            try:
                if(request_type == RequestType.GET):
                    response_query = get(
                        url=url, headers=self.__headers, verify=False,
                        params=params,
                        timeout=(self.__initial_connection_timeout, self.__timeout))
                elif(request_type == RequestType.POST):
                    response_query = post(
                        url=url, headers=self.__headers, verify=False,
                        params=params, json=post_data, auth=auth,
                        timeout=(self.__initial_connection_timeout, self.__timeout))
                send_time = response_query.elapsed.total_seconds()

            except ReadTimeout as timeout_error:

                # timeout occurred, increasing failed tries
                failed_tries += 1

                url_params = ConnectionUtils.get_url_params(url)


                # #### Aborting cases ######
                if(failed_tries > self.__max_send_retries):
                    ExceptionUtils.exception_info(error=timeout_error)
                    # read start index for debugging
                    start_index = url_params.get("pageStartIndex", None)
                    page_size = url_params.get("pageSize", None)
                    # report timeout with full information
                    raise ValueError("timeout after repeating a maximum ammount of times.",
                                     timeout_error, failed_tries, page_size, start_index)

                if(self.__page_size == self.__min_page_size):
                    ExceptionUtils.exception_info(error=timeout_error)
                    # read start index for debugging
                    start_index = url_params.get("pageStartIndex", None)
                    page_size = url_params.get("pageSize", None)
                    # report timeout with full information
                    raise ValueError("timeout after using minumum pagesize. repeating the request is of no use.",
                                     timeout_error, failed_tries, page_size, start_index)

                # #### continuing cases ######
                if(failed_tries == self.__max_send_retries): # last try
                    LOGGER.debug(f"Timeout error when requesting, now last try of total {self.__max_send_retries}. Reducing pagesize to minimum for url: {url}")
                    if(self.__verbose):
                        LOGGER.info(f"Timeout error when requesting, now last try of total {self.__max_send_retries}. Reducing pagesize to minimum for url: {url}")

                    # persist reduced size for further requests
                    self.__page_size = self.__min_page_size
                    # repeat with minimal possible size
                    LOGGER.debug(f"setting pageSize from {params.get('pageSize', None)} to {self.__page_size}")
                    params["pageSize"] = self.__page_size

                else: # failed_tries < self.__max_send_retries: more than 1 try left
                    LOGGER.debug(f"Timeout error when requesting, now on try {failed_tries} of {self.__max_send_retries}. Reducing pagesize for url: {url}")
                    if(self.__verbose):
                        LOGGER.info(f"Timeout error when requesting, now on try {failed_tries} of {self.__max_send_retries}. Reducing pagesize for url: {url}")

                    # persist reduced size for further requests
                    self.__page_size = ConnectionUtils.adjust_page_size(
                        page_size=params["pageSize"],
                        min_page_size=self.__min_page_size,
                        timeout=True)
                    # repeat with reduced page size
                    LOGGER.debug(f"setting pageSize from {params.get('pageSize', None)} to {self.__page_size}")
                    params["pageSize"] = self.__page_size

            except RequestException as error:
                ExceptionUtils.exception_info(error=error)
                raise ValueError("error when requesting endpoint", error)

        if (not response_query.ok):
            raise ConnectionUtils.rest_response_error(
                response_query,
                "Wrong Status code when requesting endpoint data",
                url)

        try:
            response_json: Dict[str, Any] = response_query.json()
        except (json.decoder.JSONDecodeError, ValueError) as error:
            raise ValueError("failed to parse result of REST-API request", response_query) from error

        return (response_json, send_time)
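# Standalone sketch of the pageSize normalization at the start of query_url above:
# an unset value gets the client default, an explicit None removes the parameter.
from typing import Any, Dict

def normalize_page_size(params: Dict[str, Any], default_page_size: int) -> Dict[str, Any]:
    if "pageSize" not in params:
        params["pageSize"] = default_page_size
    elif params["pageSize"] is None:
        params.pop("pageSize")
    return params

print(normalize_page_size({}, 100))                  # {'pageSize': 100}
print(normalize_page_size({"pageSize": None}, 100))  # {}
print(normalize_page_size({"pageSize": 25}, 100))    # {'pageSize': 25}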
Example #6
    def _parse_pool_show_cmd(
            ssh_command: SshCommand,
            ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
        """Parses the result of the `vsnap --json pool show` command, splitting it into its parts.

        Arguments:
            ssh_command {SshCommand} -- command with saved result
            ssh_type {SshTypes} -- type of the client

        Raises:
            ValueError: no command given or no result saved
            ValueError: no ssh type given

        Returns:
            Tuple[str, List[Dict[str, Any]]] -- Tuple of the tablename and an insert list
        """
        if (not ssh_command or not ssh_command.result):
            raise ValueError("no command given or empty result")
        if (not ssh_type):
            raise ValueError("no sshtype given")

        pool_result_list: List[Dict[str, Any]] = []

        try:
            result: Dict[str, List[Dict[str,
                                        Any]]] = json.loads(ssh_command.result)
        except json.decoder.JSONDecodeError:  # type: ignore
            raise ValueError("cant decode json for pool command",
                             ssh_command.result, ssh_command, ssh_type)

        for pool in result['pools']:

            pool_dict: Dict[str, Any] = {}

            # acts as white list
            insert_list = [
                'compression', 'compression_ratio', 'deduplication',
                'deduplication_ratio', 'diskgroup_size', 'encryption.enabled',
                'health', 'id', 'name', 'pool_type', 'size_before_compression',
                'size_before_deduplication', 'size_free', 'size_total',
                'size_used', 'status'
            ]
            for item in insert_list:
                (key, value) = SppUtils.get_nested_kv(item, pool)
                pool_dict[key] = value

            # rename
            pool_dict['encryption_enabled'] = pool_dict.pop('enabled')

            # change unit from bytes to megabytes
            try:
                sz_b_c = SppUtils.parse_unit(
                    pool_dict['size_before_compression'])
                sz_b_d = SppUtils.parse_unit(
                    pool_dict['size_before_deduplication'])
                sz_fr = SppUtils.parse_unit(pool_dict['size_free'])
                sz_t = SppUtils.parse_unit(pool_dict['size_total'])
                sz_u = SppUtils.parse_unit(pool_dict['size_used'])

                pool_dict['size_before_compression'] = int(
                    sz_b_c / pow(2, 20)) if sz_b_c else None
                pool_dict['size_before_deduplication'] = int(
                    sz_b_d / pow(2, 20)) if sz_b_d else None
                pool_dict['size_free'] = int(sz_fr /
                                             pow(2, 20)) if sz_fr else None
                pool_dict['size_total'] = int(sz_t /
                                              pow(2, 20)) if sz_t else None
                pool_dict['size_used'] = int(sz_u /
                                             pow(2, 20)) if sz_u else None
            except KeyError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"failed to reduce size of vsnap pool size for {pool_dict}"
                )

            # set default needed fields
            pool_dict['hostName'] = ssh_command.host_name
            pool_dict['ssh_type'] = ssh_type.name

            pool_result_list.append(pool_dict)

        return (ssh_command.table_name, pool_result_list)
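# Standalone sketch of the byte-to-mebibyte conversion applied to the pool sizes
# above. SppUtils.parse_unit is not reproduced here; plain byte counts are assumed
# as input, and None stays None just like in the original.
from typing import Optional

def bytes_to_mib(size_in_bytes: Optional[int]) -> Optional[int]:
    return int(size_in_bytes / pow(2, 20)) if size_in_bytes else None

print(bytes_to_mib(5 * pow(2, 30)))  # 5120 (5 GiB expressed in MiB)
print(bytes_to_mib(None))            # None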
    def ssh_execute_commands(
        cls, ssh_clients: List[SshClient], ssh_type: SshTypes,
        command_list: List[SshCommand]
    ) -> List[Tuple[str, List[Dict[str, Any]]]]:
        """
        functions executes commands via ssh on several hosts.
        the hosts (other, vsnap, vadp) can be defined in the JSON configuation file
        commands which shall be executed on vsnap and / or vadp proxies in the dedicated ist of strings.
        'otherCommands' is a list of commands which are executed on hosts which are not of type: vsnap | vadp.

        if any host are not reachable, they are skipped
        """

        if (not command_list):
            LOGGER.debug("No commands specified, aborting command.")
            if (cls.verbose):
                LOGGER.info("No commands specified, aborting command.")
            return []

        client_list = list(
            filter(lambda client: client.client_type is ssh_type, ssh_clients))
        if (not client_list):
            LOGGER.debug(
                f"No {ssh_type.name} ssh client present. Aborting command")
            if (cls.verbose):
                LOGGER.info(
                    f"No {ssh_type.name} ssh client present. Aborting command")
            return []

        # List to persist ssh-result stats over each client
        ssh_cmd_response_list: List[Dict[str, Union[str, int, None]]] = []
        # list to insert into influx, tuple of table and its result-lists
        result_list: List[Tuple[str, List[Dict[str, Any]]]] = []
        for client in client_list:

            if (cls.verbose):
                LOGGER.info(
                    f">> executing {ssh_type.name} command(s) on host {client.host_name}"
                )

            try:
                result_commands = client.execute_commands(
                    commands=command_list, verbose=cls.verbose)

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "Error when executing commands, skipping this client")
                continue

            for ssh_command in result_commands:
                # generate stats for the ssh-stats list
                insert_dict: Dict[str, Union[str, int, None]] = {}
                insert_dict["host"] = ssh_command.host_name
                insert_dict["command"] = ssh_command.cmd
                insert_dict["output"] = json.dumps(ssh_command.result)
                insert_dict['ssh_type'] = ssh_type.name
                time_key, time_value = SppUtils.get_capture_timestamp_sec()
                insert_dict[time_key] = time_value

                ssh_cmd_response_list.append(insert_dict)

                # execute the command
                try:
                    table_result_tuple = ssh_command.parse_result(
                        ssh_type=ssh_type)
                    if (table_result_tuple):
                        # save the command into the result set with its table
                        result_list.append(table_result_tuple)
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        "Error when parsing result, skipping parsing of this result"
                    )

        # append the ssh-stats list once, since each client's results were already added to it above
        result_list.append(("sshCmdResponse", ssh_cmd_response_list))
        return result_list
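# Sketch of the per-command stats record built in the loop above. The timestamp key
# is an assumption here; the original takes both key and value from
# SppUtils.get_capture_timestamp_sec().
import json
import time
from typing import Dict, Union

def build_ssh_stats(host: str, cmd: str, result: str, ssh_type: str) -> Dict[str, Union[str, int, None]]:
    return {
        "host": host,
        "command": cmd,
        "output": json.dumps(result),
        "ssh_type": ssh_type,
        "time": int(time.time()),  # assumed key name
    }

print(build_ssh_stats("vsnap01", "vsnap --json pool show", "{...}", "VSNAP"))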
Example #8
    def insert_dicts_to_buffer(self, table_name: str,
                               list_with_dicts: List[Dict[str, Any]]) -> None:
        """Insert a list of dicts with data into influxdb. Splits according to table definition.

        It is highly recommened to define a table before in database_table.py. If not present, splits by type analysis.
        Important: Querys are only buffered, not sent. Call flush_insert_buffer to flush.

        Arguments:
            table_name {str} -- Name of the table to be inserted
            list_with_dicts {List[Dict[str, Any]]} -- List with dicts whith collum name as key.

        Raises:
            ValueError: No list with dictonarys are given or of wrong type.
            ValueError: No table name is given
        """
        LOGGER.debug(f"Enter insert_dicts for table: {table_name}")
        if (list_with_dicts is None):  # empty list is allowed
            raise ValueError("missing list with dictonarys in insert")
        if (not table_name):
            raise ValueError("table name needs to be set in insert")

        # Only insert if there is something to insert
        if (not list_with_dicts):
            LOGGER.debug("nothing to insert for table %s due to empty list",
                         table_name)
            return

        # get table instance
        table = self.database[table_name]

        # Generate queries for each dict
        query_buffer = []
        for mydict in list_with_dicts:
            try:
                # split dict according to default tables
                (tags, values,
                 timestamp) = table.split_by_table_def(mydict=mydict)

                if (isinstance(timestamp, str)):
                    timestamp = int(timestamp)
                # LOGGER.debug("%d %s %s %d",appendCount,tags,values,timestamp)

                # create query and append to query_buffer
                query_buffer.append(InsertQuery(table, values, tags,
                                                timestamp))
            except ValueError as err:
                ExceptionUtils.exception_info(
                    error=err, extra_message="skipping single dict to insert")
                continue

        # extend existing inserts by new one and add to insert_buffer
        table_buffer = self.__insert_buffer.get(table, list())
        table_buffer.extend(query_buffer)
        self.__insert_buffer[table] = table_buffer
        LOGGER.debug("Appended %d items to the insert buffer",
                     len(query_buffer))

        # safeguard to avoid a MemoryError
        if (len(self.__insert_buffer[table]) >
                2 * self.__query_max_batch_size):
            self.flush_insert_buffer()

        LOGGER.debug(f"Exit insert_dicts for table: {table_name}")
Example #9
    def flush_insert_buffer(self, fallback: bool = False) -> None:
        """Flushes the insert buffer, send querys to influxdb server.

        Sends in batches defined by `__batch_size` to reduce http overhead.
        Only send-statistics remain in buffer, flush again to send those too.
        Retries once into fallback mode if first request fails with modified settings.

        Keyword Arguments:
            fallback {bool} -- Whether to use fallback-options. Does not repeat on fallback (default: {False})

        Raises:
            ValueError: Critical: The query Buffer is None.
        """

        if (self.__insert_buffer is None):
            raise ValueError(
                "query buffer is somehow None, this should never happen!")
        # Only send if there is something to send
        if (not self.__insert_buffer):
            return

        # pre-save the keys to avoid a RuntimeError due to "dictionary keys changed during iteration"
        # this happens because the fallback re-run changes the insert_buffer
        insert_keys = list(self.__insert_buffer.keys())
        for table in insert_keys:
            # default to an empty list in case the key isn't valid anymore (due to the fallback option)
            queries = list(
                map(lambda query: query.to_query(),
                    self.__insert_buffer.get(table, [])))
            item_count = len(queries)
            if (item_count == 0):
                continue

            # select the batch size, smaller in fallback mode
            if (not fallback):
                batch_size = self.__query_max_batch_size
            else:
                batch_size = self.__fallback_max_batch_size

            re_send: bool = False
            error_msg: Optional[str] = None
            start_time = time.perf_counter()
            try:
                self.__client.write_points(
                    points=queries,
                    database=self.database.name,
                    retention_policy=table.retention_policy.name,
                    batch_size=batch_size,
                    time_precision='s',
                    protocol='line')
                end_time = time.perf_counter()
            except InfluxDBClientError as error:  # type: ignore
                match = re.match(r".*partial write:[\s\w]+=(\d+).*",
                                 error.content)

                if (match and int(match.group(1)) < batch_size):
                    # beyond 10.000 dropped points everything is lost, below that the rest is still written
                    # ignore this case, it's unavoidable and doesn't change anything
                    pass
                elif (re.match(r".*partial write: unable to parse .*",
                               error.content)):
                    # some messages are lost, other written
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f"Some messages were lost when sending buffer for table {table.name}, but everything else should be OK"
                    )
                    error_msg = getattr(error, 'message', repr(error))
                else:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f"Client error when sending insert buffer for table {table.name}."
                    )
                    error_msg = getattr(error, 'message', repr(error))
                    # re-try with a smaller batch size, unsure if this helps
                    re_send = True

            except (InfluxDBServerError, ConnectionError,
                    requests.exceptions.ConnectionError
                    ) as error:  # type: ignore
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"Connection error when sending insert buffer for table {table.name}."
                )
                error_msg = getattr(error, 'message', repr(error))
                re_send = True

            # measure timing
            end_time = time.perf_counter()

            # clear the table which just got sent
            if (re_send and not fallback):
                ExceptionUtils.error_message(
                    "Trying to send influx buffer again with fallback options")
                self.flush_insert_buffer(fallback=True)

            # default None to avoid a key error if the table was already popped on fallback
            self.__insert_buffer.pop(table, None)

            # add metrics for the next sending process.
            # compute duration, metrics computed per batch
            self.__insert_metrics_to_buffer(Keyword.INSERT,
                                            table,
                                            end_time - start_time,
                                            item_count,
                                            error=error_msg)
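# Standalone sketch of the partial-write classification used above: the number of
# dropped points is read from the error text and compared against the batch size.
# The error strings are examples, not verbatim InfluxDB output.
import re

def classify_partial_write(error_content: str, batch_size: int) -> str:
    match = re.match(r".*partial write:[\s\w]+=(\d+).*", error_content)
    if match and int(match.group(1)) < batch_size:
        return "ignore"            # only points beyond the retention policy dropped
    if re.match(r".*partial write: unable to parse .*", error_content):
        return "log-and-continue"  # some messages lost, the rest was written
    return "re-send"               # unknown client error, retry with fallback options

print(classify_partial_write(
    "partial write: points beyond retention policy dropped=17", 10000))  # ignore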
Example #10
    def check_grant_user(self, username: str, permission: str):
        """Checks and Grants the permissions for a user to match at least the required permission or a higher one.

        Warns if user does not exists. Grants permission if current permissions to not fullfil the requirement.
        This method does not abort if the check or grant was unsuccessfull!

        Args:
            username (str): name of the user to be checked
            permission (str): permissions to be granted: READ, WRITE, ALL

        Raises:
            ValueError: No username provided
            ValueError: no permissions provided
        """
        try:
            LOGGER.debug(
                f"Checking/Granting user {username} for {permission} permissions on db {self.database.name}."
            )
            if (not username):
                raise ValueError(
                    "checking/granting a user permissions require an username")
            if (not permission):
                raise ValueError(
                    "checking/granting a user permissions require a defined set of permissions"
                )

            # Get all users to check for the required user
            user_list: List[Dict[str, Union[
                str, bool]]] = self.__client.get_list_users()
            LOGGER.debug(f"Returned list of users: {user_list}")

            # get the wanted user if it exists. Default value to not throw an error.
            user_dict = next(
                filter(lambda user_dict: user_dict['user'] == username,
                       user_list), None)
            LOGGER.debug(f"Found user: {user_dict}")

            # SPPMon should not create a user since a default password would then be used.
            # It is very unlikely that this password would get changed, which poses a risk of leaking data.
            if (not user_dict):
                ExceptionUtils.error_message(
                    f"The user '{username}' does not exist. Please create it according to the documentation."
                )
                return  # not abort SPPMon, only minor error

            if (user_dict['admin']):
                LOGGER.debug(f"{username} is already admin. Finished check")
                return

            # get the privileges of the user to check if it already has matching permissions
            db_privileges: List[Dict[
                str, str]] = self.__client.get_list_privileges(username)
            LOGGER.debug(db_privileges)

            # check for existing privileges
            db_entry = next(
                filter(
                    lambda entry_dict: entry_dict['database'] == self.database.
                    name, db_privileges), None)
            # there must be permissions of either wanted permission or higher (all)
            if (db_entry and (db_entry['privilege'] == permission
                              or db_entry['privilege'] == "ALL")):
                LOGGER.debug(
                    f"{username} has already correct permissions. Finished check"
                )
                return

            # else give permissions
            LOGGER.info(
                f"Permissions missing for user {username}, granting {permission} permissions."
            )
            self.__client.grant_privilege(permission, self.database.name,
                                          username)

            LOGGER.debug(f"Granted permissions to {username}")

        except (ValueError, InfluxDBClientError, InfluxDBServerError,
                requests.exceptions.ConnectionError) as error:  # type: ignore
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                f"User check failed for user {username} with permissions {permission} on db {self.database.name}"
            )  # type: ignore
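# Sketch of the privilege check performed above: the requirement is already met if
# an entry for the database grants the wanted permission or ALL. The dict layout
# mirrors what the code above reads from get_list_privileges.
from typing import Dict, List, Optional

def has_permission(db_privileges: List[Dict[str, str]], database: str,
                   permission: str) -> bool:
    entry: Optional[Dict[str, str]] = next(
        (e for e in db_privileges if e["database"] == database), None)
    return bool(entry and entry["privilege"] in (permission, "ALL"))

print(has_permission([{"database": "spp", "privilege": "READ"}], "spp", "READ"))  # True
print(has_permission([{"database": "spp", "privilege": "READ"}], "spp", "WRITE")) # False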
Example #11
    def copy_database(self, new_database_name: str) -> None:
        if (not new_database_name):
            raise ValueError(
                "copy_database requires a new database name to copy to.")

        # Program information
        LOGGER.info(
            f"Copy Database: transfering the data from database {self.database.name} into {new_database_name}."
        )
        LOGGER.info(
            "> Info: This also includes all data from `autogen` retention policy, sorted into the correct retention policies."
        )

        # create db, nothing happens if it already exists
        LOGGER.info("> Creating the new database if it didn't already exist")
        self.setup_db(new_database_name)

        # check for existing retention policies and continuous queries in the influxdb
        LOGGER.info(
            ">> Checking and creating retention policies for the new database. Ignoring continuous queries."
        )
        self.check_create_rp(new_database_name)
        # self.check_create_cq() # Note: Not possible due to fully qualified statements. This would also not truly conserve the data

        LOGGER.info("> Computing queries to be send to the server.")
        queries: List[str] = []
        # copies all tables into their respective duplicates; data older than the RP duration will be dropped.
        for table in self.database.tables.values():
            autogen_query_str = f"SELECT * INTO {new_database_name}.{table.retention_policy.name}.{table.name} FROM {table.database.name}.autogen.{table.name} WHERE time > now() - {table.retention_policy.duration} GROUP BY *"
            queries.append(autogen_query_str)

            rp_query_str = f"SELECT * INTO {new_database_name}.{table.retention_policy.name}.{table.name} FROM {table} WHERE time > now() - {table.retention_policy.duration} GROUP BY *"
            queries.append(rp_query_str)

        # Compute data with a timestamp beyond the initial RP duration into the other RPs.
        for con_query in self.database.continuous_queries:
            cq_query_str: str = con_query.to_query()

            # replacing the rp inside of the toString representation
            # this is easier than individual matching/code replacement
            # Not every database name should be replaced
            match = re.search(
                r"BEGIN(.*(INTO\s+(.+)\..+\..+)\s+(FROM\s+\w+\.(\w+)\.\w+)(?:\s+WHERE\s+(.+))?\s+GROUP BY.*)END",
                cq_query_str)
            if (not match):
                raise ValueError(
                    f">> error when matching continous query {cq_query_str}. Aborting."
                )

            full_match = match.group(1)
            into_clause = match.group(2)
            old_database_str = match.group(3)
            from_clause = match.group(4)
            from_rp = match.group(5)
            where_clause = match.group(6)

            # Add a time limit in the where clause to prevent massive truncation due to the retention-policy time limit
            new_full_match = full_match
            if (not con_query.select_query
                    or con_query.select_query.into_table is None):
                ExceptionUtils.error_message(
                    f">> Into table of continous query is none. Adjust query manually! {full_match}"
                )
            elif (con_query.select_query.into_table.retention_policy.duration
                  != '0s'):
                # Caution: if truncation of a query is above 10.000 it won't be saved!
                clause = f"time > now() - {con_query.select_query.into_table.retention_policy.duration}"
                if (where_clause):
                    new_full_match = new_full_match.replace(
                        where_clause, where_clause + " AND " + clause)
                else:
                    new_full_match = new_full_match.replace(
                        from_clause, from_clause + " WHERE " + clause)

            # replace old dbname with new one
            new_into_clause = into_clause.replace(old_database_str,
                                                  new_database_name)
            new_full_match = new_full_match.replace(into_clause,
                                                    new_into_clause)

            # case 1: keep retention policy
            queries.append(new_full_match)

            # case 2: autogen as from RP
            new_from_clause = from_clause.replace(from_rp, "autogen")
            auto_gen_match = new_full_match.replace(from_clause,
                                                    new_from_clause)
            queries.append(auto_gen_match)

        LOGGER.info("> Finished Computing, starting to send.")

        # how many lines were transferred
        line_count: int = 0
        # how often was a query partially written, not line count!
        dropped_count: int = 0
        # how often was data dropped above the 10.000 limit?
        critical_drop: int = 0

        # print statistics
        # send time since last print
        send_time_collection: float = 0
        # line count since last print
        line_collection: int = 0

        # disable timeout
        old_timeout = self.__client._timeout
        self.__client = InfluxDBClient(  # type: ignore
            host=self.__address,
            port=self.__port,
            username=self.__user,
            password=self.__password,
            ssl=self.__use_ssl,
            verify_ssl=self.__verify_ssl,
            timeout=7200)
        # ping to make sure connection works
        version: str = self.__client.ping()
        LOGGER.info(
            f">> Connected to influxdb with new timeout of {self.__client._timeout}, version: {version}"
        )
        LOGGER.info(">> Starting transfer of data")
        i = 0

        for query in queries:
            try:
                start_time = time.perf_counter()
                # seems like you may only send one SELECT INTO at once via python
                result = self.__client.query(  # type: ignore
                    query=query,
                    epoch='s',
                    database=self.database.name)
                end_time = time.perf_counter()

                # count lines written, max 1
                for result in result.get_points():
                    i += 1
                    line_count += result["written"]

                    # print statistics
                    send_time_collection += end_time - start_time
                    line_collection += result["written"]

                    # Print only every 10 queries or if the collected send time is too high
                    if (i % 10 == 0 or send_time_collection >= 2):
                        LOGGER.info(
                            f'query {i}/{len(queries)}: {line_collection} new lines in {send_time_collection}s.'
                        )
                        line_collection = 0
                        send_time_collection = 0

            except InfluxDBClientError as error:
                # only raise if the error is unexpected
                if (re.search(
                        f"partial write: points beyond retention policy dropped=10000",
                        error.content)):
                    critical_drop += 1
                    raise ValueError(
                        ">> transfer of data failed, retry manually with a shorter WHERE-clause",
                        query)
                if (re.search(
                        f"partial write: points beyond retention policy dropped=",
                        error.content)):
                    dropped_count += 1
                else:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=
                        f">> transfer of data failed for query {query}")
                    critical_drop += 1

            except (InfluxDBServerError,
                    requests.exceptions.ConnectionError) as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f">> transfer of data failed for query {query}")
                critical_drop += 1

        # reset timeout
        self.__client = InfluxDBClient(  # type: ignore
            host=self.__address,
            port=self.__port,
            username=self.__user,
            password=self.__password,
            ssl=self.__use_ssl,
            verify_ssl=self.__verify_ssl,
            timeout=old_timeout)
        # ping to make sure connection works
        version: str = self.__client.ping()
        LOGGER.info(
            f">> Changed timeout of influxDB to old timeout of {self.__client._timeout}, version: {version}"
        )

        LOGGER.info(f"> Total transfered {line_count} lines of results.")
        if (dropped_count):
            LOGGER.info(
                f"> WARNING: Could not count lines of {dropped_count} queries due an expected error. No need for manual action."
            )
        if (critical_drop):
            msg: str = (
                f"ERROR: Could not transfer data of {critical_drop} tables, check messages above to retry manually!\n"
                +
                "Please send the query manually with a adjusted 'from table': '$database.autogen.tablename'\n "
                +
                "Adjust other values as required. Drop due Retention Policy is 'OK' until 10.000.\n"
                +
                "If the drop count reaches 10.000 you need to cut the query into smaller bits."
            )
            ExceptionUtils.error_message(msg)
        elif (line_count == 0):
            ExceptionUtils.error_message(
                "ERROR: No data was transferred, make sure your database name is correct and the db is not empty."
            )
        else:
            LOGGER.info("Database copied sucessfully")
Example #12
    def check_create_cq(self) -> None:
        """Checks if any continuous query needs to be altered or added

        Raises:
            ValueError: check failed due to a database error
        """
        try:
            # returns a list of dictionaries with the db name as key
            # inside each dict there is a list of its CQs
            # each CQ is displayed as a 2-element dict: 'name' and 'query'
            results: List[Dict[str, List[Dict[
                str, str]]]] = self.__client.get_list_continuous_queries()

            # get the CQs of the correct db
            # list of 2-element CQ dicts: 'name' and 'query'
            cq_result_list: List[Dict[str, str]] = next(
                (
                    cq.get(self.database.name, []) for cq in results
                    # only if matches the db name
                    if cq.get(self.database.name, False)),
                [])

            # save all results into a dict for quicker accessing afterwards
            cq_result_dict: Dict[str, str] = {}
            for cq_result in cq_result_list:
                cq_result_dict[cq_result['name']] = cq_result['query']

            # queries which need to be added
            add_cq_list: List[ContinuousQuery] = []
            # queries to be deleted (no alter possible): save name only
            drop_cq_list: List[str] = []

            # check for each cq if it needs to be 1. dropped and 2. added
            for continuous_query in self.database.continuous_queries:

                result_cq = cq_result_dict.get(continuous_query.name, None)
                if (result_cq is None):
                    add_cq_list.append(continuous_query)
                elif (result_cq != continuous_query.to_query()):
                    LOGGER.debug(f"result_cq: {result_cq}")
                    LOGGER.debug(f"desired_cq: {continuous_query.to_query()}")
                    # delete result cq and then add it new
                    # save name only
                    drop_cq_list.append(continuous_query.name)
                    add_cq_list.append(continuous_query)
                # else: all good

            LOGGER.debug(f"deleting {len(drop_cq_list)} CQ's: {drop_cq_list}")
            # alter is not possible -> drop and re-add
            for query_name in drop_cq_list:
                self.__client.drop_continuous_query(  # type: ignore
                    name=query_name,
                    database=self.database.name)

            # adding new / altered CQ's
            LOGGER.debug(
                f"adding {len(add_cq_list)} CQ's. adding {add_cq_list}")
            for continuous_query in add_cq_list:
                self.__client.create_continuous_query(  # type: ignore
                    name=continuous_query.name,
                    select=continuous_query.select,
                    database=continuous_query.database.name,
                    resample_opts=continuous_query.resample_opts)

        except (ValueError, InfluxDBClientError, InfluxDBServerError,
                requests.exceptions.ConnectionError) as error:  # type: ignore
            ExceptionUtils.exception_info(error=error)  # type: ignore
            raise ValueError("Continuous Query check failed")
Example #13
    def __init__(self, influx_client: InfluxClient, config_file: Dict[str, Any], verbose: bool = False):
        if(not config_file):
            raise ValueError("Require config file to setup ssh clients")
        if(not influx_client):
            raise ValueError("need InfluxClient to send data to DB")

        self.__influx_client = influx_client
        self.__verbose = verbose

        try:
            self.__ssh_clients = self.setup_ssh_clients(config_file)
        except ValueError as error:
            ExceptionUtils.exception_info(error)
            raise ValueError("No ssh-clients are present or error when reading config file. Skipping SSH-Methods creation")

        # ################################################################################################
        # ################################### SSH COMMAND LIST GROUPS ####################################
        # ################################################################################################
        # Add all required commands ONLY here. Format:
        # always a list of `SshCommand`; create an instance for each command needed.
        # Group the commands by client type; the `all` list is executed for every type.
        # If you add new types, also add them to the `SshTypes` enum.
        # You may use the same table name multiple times, just make sure you also define a corresponding
        # table in `database_tables.py`.
        # After declaring a command here it is executed like the others below (see the sketch after this example).

        # those commands are going to be executed on ANY client.
        self.__all_command_list = [
            SshCommand(
                command="mpstat",
                parse_function=SshMethods._parse_mpstat_cmd,
                table_name="ssh_mpstat_cmd"
            ),
            SshCommand(
                command="free",
                parse_function=SshMethods._parse_free_cmd,
                table_name="ssh_free_cmd"
            )
        ]

        # Those commands are only executed on the associated (key) client type
        self.__client_commands: Dict[SshTypes, List[SshCommand]] = {

            # SERVER
            SshTypes.SERVER: [
                # more commands are added later via a loop, see below

                SshCommand(
                    command='df -h / --block-size=G',
                    parse_function=SshMethods._parse_df_cmd,
                    table_name="df_ssh"
                ),
                SshCommand(
                    command='df -h /opt/IBM/SPP --block-size=G',
                    parse_function=SshMethods._parse_df_cmd,
                    table_name="df_ssh"
                ),
                ## df -h /
                ## df -h /opt/IBM/SPP
            ],

            # VSnap
            SshTypes.VSNAP: [
                SshCommand(
                    command='systemctl status vsnap-api.service > /dev/null && sudo vsnap --json pool show',
                    parse_function=SshMethods._parse_pool_show_cmd,
                    table_name="vsnap_pools"
                ),
                SshCommand(
                    command='systemctl status vsnap-api.service > /dev/null && sudo vsnap --json system stats',
                    parse_function=SshMethods._parse_system_stats_cmd,
                    table_name="vsnap_system_stats"
                ),
                SshCommand(
                    command='df -h / --block-size=G',
                    parse_function=SshMethods._parse_df_cmd,
                    table_name="df_ssh"
                ),
                ##  zpool list
                ##  df -h /
            ],

            # VADP
            SshTypes.VADP: [
                # nothing yet
            ],

            # CLOUDPROXY
            SshTypes.CLOUDPROXY: [
                # nothing yet
            ],

            # OTHER
            SshTypes.OTHER: [
                SshCommand(
                    command="df -h --block-size=G",
                    parse_function=SshMethods._parse_df_cmd,
                    table_name="df_ssh"
                )
            ]
        }

        # ################ MULTI COMMAND ADD ##########################

        # SERVER
        # server commands are added in a loop due to the multiple monitored processes
        self.__process_grep_list = ["mongod", "beam.smp", "java"] # note: this list is iterated twice below
        for grep_name in self.__process_grep_list:
            self.__client_commands[SshTypes.SERVER].append(
                SshCommand(
                    command=f"ps -o \"%cpu,%mem,comm,rss,vsz,user,pid,etimes\" -p $(pgrep -d',' -f {grep_name}) S -ww",
                    parse_function=self._parse_ps_cmd,
                    table_name="processStats"
                )
            )
        # `top` commands, used for CPU stats only
        for grep_name in self.__process_grep_list:
            self.__client_commands[SshTypes.SERVER].append(
                SshCommand(
                    command=f"top -bs -w 512 -n1 -p $(pgrep -d',' -f {grep_name})",
                    parse_function=self._parse_top_cmd,
                    table_name="processStats"
                )
            )
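
A sketch of what adding one more command could look like. `SshCommandSketch` and the parser below are hypothetical stand-ins for the project's `SshCommand` and `_parse_*` functions, only meant to show the shape of the registration described in the comment block above.

from dataclasses import dataclass
from typing import Any, Callable, Dict, List

@dataclass
class SshCommandSketch:
    """Hypothetical stand-in for SPPMon's SshCommand class."""
    command: str
    parse_function: Callable[[str], List[Dict[str, Any]]]
    table_name: str

def parse_uptime_sketch(text: str) -> List[Dict[str, Any]]:
    """Hypothetical parser: turn `uptime` output into one row for an Influx table."""
    load_1, load_5, load_15 = text.rsplit("load average:", 1)[1].split(",")
    return [{"load1": float(load_1), "load5": float(load_5), "load15": float(load_15)}]

# registering a new command in the style used above; a matching table
# would be declared in `database_tables.py` under the same name.
extra_commands = [
    SshCommandSketch(
        command="uptime",
        parse_function=parse_uptime_sketch,
        table_name="ssh_uptime_cmd",
    )
]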
Example #14
    def execute_commands(self,
                         commands: List[SshCommand],
                         verbose: bool = False) -> List[SshCommand]:
        """Executes given commands on this ssh client. Returns a new list of commands.

        Automatically connects and disconnects.

        Arguments:
            commands {List[SshCommand]} -- List of commands to be executed

        Keyword Arguments:
            verbose {bool} -- whether to print the result  (default: {False})

        Raises:
            ValueError: No list of commands given.
        """
        if (not commands or not isinstance(commands, list)):
            raise ValueError("Need list of commands to execute")

        LOGGER.debug(
            f"> connecting to {self.client_type.name} client on host {self.host_name}"
        )
        if (verbose):
            LOGGER.info(
                f"> connecting to {self.client_type.name} client on host {self.host_name}"
            )

        self.connect()

        LOGGER.debug("> connection successfull")
        if (verbose):
            LOGGER.info("> connection successfull")

        new_command_list: List[SshCommand] = []
        for ssh_command in commands:

            if (self.__skip_cmd(ssh_command)):
                LOGGER.info(
                    f"Skipped command {ssh_command.cmd} on host {self.host_name}"
                )
                continue

            try:
                LOGGER.debug(
                    f"Executing command {ssh_command.cmd} on host {self.host_name}"
                )
                result = self.__send_command(ssh_command.cmd)

                # save result
                new_command = ssh_command.save_result(result, self.host_name)
                LOGGER.debug(f"Command result: {result}")

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    f"failed to execute command on host: {self.host_name}, skipping it: {ssh_command.cmd}"
                )

                # explicitly store an empty result so the command is still recorded
                new_command = ssh_command.save_result(result=None,
                                                      host_name=self.host_name)
            new_command_list.append(new_command)

        self.disconnect()

        return new_command_list
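
For comparison, a minimal connect/execute/disconnect cycle with paramiko, which is one way the `connect()`/`__send_command()` calls above could be backed; host, user and password below are placeholders and this is not SPPMon's SshClient.

import paramiko

def run_remote_command(host: str, user: str, password: str, command: str) -> str:
    """Open an SSH connection, run one command and return its stdout (a sketch only)."""
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname=host, username=user, password=password)
    try:
        _stdin, stdout, stderr = client.exec_command(command)
        error_text = stderr.read().decode()
        if error_text:
            raise ValueError(f"command failed on {host}: {error_text}")
        return stdout.read().decode()
    finally:
        client.close()

# usage sketch with placeholder credentials:
# output = run_remote_command("vsnap.example.com", "sppmon", "secret", "df -h / --block-size=G")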
Example #15
    def test_connection(influx_client: InfluxClient,
                        rest_client: Optional[RestClient],
                        config_file: Dict[str, Any]):
        if (not config_file):
            raise ValueError("SPPmon does not work without a config file")

        LOGGER.info("Testing all connections required for SPPMon to work")
        working: bool = True  # SPPMon itself will finish successfully (no critical errors)
        no_warnings: bool = True  # SPPMon will finish without any warnings (no errors at all)

        # ## InfluxDB ##

        LOGGER.info("> Testing and configuring InfluxDB")
        try:
            influx_client.connect()
            influx_client.disconnect()
            if (not influx_client.use_ssl):
                ExceptionUtils.error_message(
                    "> WARNING: Mandatory SSL is disabled. We hightly recommend to enable it!"
                )
                no_warnings = False

            LOGGER.info("InfluxDB is ready for use")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=
                "> Testing of the InfluxDB failed. This is a crictial component of SPPMon."
            )
            working = False

        # ## REST-API ##

        LOGGER.info("> Testing REST-API of SPP.")
        try:
            if (not rest_client):
                raise ValueError(
                    "Rest-client is setup. Unavailable to test it.")
            rest_client.login()
            (version_nr, build_nr) = rest_client.get_spp_version_build()
            LOGGER.info(
                f">> Sucessfully connected to SPP V{version_nr}, build {build_nr}."
            )
            rest_client.logout()
            LOGGER.info("> REST-API is ready for use")
        except ValueError as error:
            ExceptionUtils.exception_info(
                error,
                extra_message=
                "> Testing of the REST-API failed. This is a crictial component of SPPMon."
            )
            working = False

        # ## SSH-CLIENTS ##

        LOGGER.info(
            "> Testing all types of SSH-Clients: Server, VAPDs, vSnaps, Cloudproxy and others"
        )
        ssh_working = True  # The arg --ssh will finish without any error at all

        # Count of clients checks
        ssh_clients: List[SshClient] = SshMethods.setup_ssh_clients(
            config_file)
        if (not ssh_clients):
            ExceptionUtils.error_message(
                ">> No SSH-clients detected at all. At least the server itself should be added for process-statistics."
            )
            ssh_working = False
        else:
            for ssh_type in SshTypes:
                if (not list(
                        filter(lambda client: client.client_type == ssh_type,
                               ssh_clients))):
                    LOGGER.info(f">> No {ssh_type.name} client detected.")

                    if (ssh_type == SshTypes.SERVER):
                        ExceptionUtils.error_message(
                            ">> Critical: Without the Server as ssh client you won't have any process statistics available. These are a key part of SPPMon."
                        )
                        ssh_working = False  # No error, but still critical

                    if (ssh_type == SshTypes.VSNAP):
                        LOGGER.info(
                            ">> WARNING: Without a vSnap as ssh client you have no access to storage information. You may add vSnaps for additional monitoring and alerts."
                        )
                        no_warnings = False  # ssh will still work, but that's definitely a warning

            ssh_methods: SshMethods = SshMethods(influx_client, config_file,
                                                 False)
            # Connection check
            LOGGER.info(
                f">> Testing now connection and commands of {len(ssh_clients)} registered ssh-clients."
            )
            for client in ssh_clients:
                try:
                    client.connect()
                    client.disconnect()

                    error_count: int = len(ExceptionUtils.stored_errors)
                    MethodUtils.ssh_execute_commands(
                        ssh_clients=[client],
                        ssh_type=client.client_type,
                        command_list=ssh_methods.client_commands[
                            client.client_type] + ssh_methods.all_command_list)
                    if (len(ExceptionUtils.stored_errors) != error_count):
                        ssh_working = False
                        ExceptionUtils.error_message(
                            f"Not all commands available for client {client.host_name} with type: {client.client_type}.\n"
                            +
                            "Please check manually if the commands are installed and their output."
                        )

                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error,
                        extra_message=
                        f"Connection failed for client {client.host_name} with type: {client.client_type}."
                    )
                    ssh_working = False

        if (ssh_working):
            LOGGER.info("> Testing of SSH-clients sucessfull.")
        else:
            LOGGER.info(
                "> Testing of SSH-clients failed! SPPMon will still work, not all informations are available."
            )
            no_warnings = False

        # #### Conclusion ####

        if (working and no_warnings):
            LOGGER.info(
                "> All components tested sucessfully. SPPMon is ready to be used!"
            )
        elif (working):
            LOGGER.info(
                "> Testing partially sucessful. SPPMon will run, but please check the warnings."
            )
        else:
            LOGGER.info(
                "> Testing failed. SPPMon is not ready to be used. Please fix the connection issues."
            )
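
The per-type presence check above boils down to filtering the client list by its enum type; a self-contained sketch with stand-in classes (the real `SshTypes` and client objects live in the project):

from dataclasses import dataclass
from enum import Enum
from typing import List

class SshTypesSketch(Enum):
    """Hypothetical stand-in for SPPMon's SshTypes enum."""
    SERVER = "server"
    VSNAP = "vsnap"
    VADP = "vadp"

@dataclass
class SshClientSketch:
    host_name: str
    client_type: SshTypesSketch

clients: List[SshClientSketch] = [
    SshClientSketch("spp-server.example.com", SshTypesSketch.SERVER),
    SshClientSketch("vsnap-1.example.com", SshTypesSketch.VSNAP),
]

# warn for every type that has no registered client, as done in the test above
for ssh_type in SshTypesSketch:
    if not [c for c in clients if c.client_type is ssh_type]:
        print(f"No {ssh_type.name} client detected.")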
    def set_optional_configs(self, config_file: Dict[str, Any]) -> None:
        """Sets up any optional infrastructure, to be called within the init.

        Be aware that not everything may be initialized at call time.
        Add config here if the system should not abort when it is missing.

        Arguments:
            config_file {Dict[str, Any]} -- Opened Config file
        """

        if (not config_file):
            ExceptionUtils.error_message(
                "missing or empty config file, aborting.")
            self.exit(error_code=ERROR_CODE_CMD_LINE)

        # ############################ REST-API #####################################
        try:
            auth_rest = SppUtils.get_cfg_params(param_dict=config_file,
                                                param_name="sppServer")

            if (not isinstance(auth_rest, dict)):
                raise ValueError("sppServer config need to be dict")

            self.job_log_retention_time = auth_rest.get(
                "jobLog_rentation", "60d")

            ConnectionUtils.verbose = OPTIONS.verbose
            # ### Loaded Systems part 1/2 ### #
            if (OPTIONS.minimumLogs or OPTIONS.loadedSystem):
                # Setting pagesize scaling settings
                ConnectionUtils.timeout_reduction = self.loaded_timeout_reduction
                ConnectionUtils.allowed_send_delta = self.loaded_allowed_send_delta
                ConnectionUtils.max_scaling_factor = self.loaded_max_scaling_factor

                # Setting RestClient request settings.
                self.rest_client = RestClient(
                    auth_rest=auth_rest,
                    pref_send_time=self.loaded_pref_send_time,
                    request_timeout=self.loaded_request_timeout,
                    send_retries=self.loaded_send_retries,
                    starting_page_size=self.loaded_starting_page_size,
                    min_page_size=self.loaded_min_page_size,
                    verbose=OPTIONS.verbose)
            else:
                ConnectionUtils.timeout_reduction = self.timeout_reduction
                ConnectionUtils.allowed_send_delta = self.allowed_send_delta
                ConnectionUtils.max_scaling_factor = self.max_scaling_factor

                # Setting RestClient request settings.
                self.rest_client = RestClient(
                    auth_rest=auth_rest,
                    pref_send_time=self.pref_send_time,
                    request_timeout=self.request_timeout,
                    send_retries=self.send_retries,
                    starting_page_size=self.starting_page_size,
                    min_page_size=self.min_page_size,
                    verbose=OPTIONS.verbose)

            self.api_queries = ApiQueries(self.rest_client)
            self.rest_client.login()

        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message="REST-API is not available due Config error")
            self.rest_client = None
            self.api_queries = None

        # ######################## System, Job and Hypervisor Methods ##################
        try:
            # set up explicitly ahead due to dependencies
            self.system_methods = SystemMethods(self.influx_client,
                                                self.api_queries,
                                                OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ### Loaded Systems part 2/2 ### #
        if (OPTIONS.minimumLogs or OPTIONS.loadedSystem):
            given_log_types = self.loaded_joblog_types
        else:
            given_log_types = self.joblog_types

        try:
            self.job_methods = JobMethods(self.influx_client, self.api_queries,
                                          self.job_log_retention_time,
                                          given_log_types, OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        try:
            # dependent on system methods
            self.hypervisor_methods = ProtectionMethods(
                self.system_methods, self.influx_client, self.api_queries,
                OPTIONS.verbose)
        except ValueError as error:
            ExceptionUtils.exception_info(error=error)

        # ############################### SSH #####################################
        if (self.ssh or self.process_stats):
            try:

                auth_ssh = SppUtils.get_cfg_params(param_dict=config_file,
                                                   param_name="sshclients")

                ssh_clients: List[SshClient] = []
                if (not isinstance(auth_ssh, list)):
                    raise ValueError("not a list of sshconfig given", auth_ssh)

                for client_ssh in auth_ssh:
                    try:
                        ssh_clients.append(SshClient(client_ssh))
                    except ValueError as error:
                        ExceptionUtils.exception_info(
                            error=error,
                            extra_message=
                            f"Setting up one client failed, skipping it. Client: \
                            {client_ssh.get('name', 'ERROR WHEN GETTING NAME')}"
                        )

                # replaces the initial None once setup finished
                self.ssh_methods = SshMethods(influx_client=self.influx_client,
                                              ssh_clients=ssh_clients,
                                              verbose=OPTIONS.verbose)

            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=
                    "SSH-Commands are not available due Config error")
    def create_inventory_summary(self) -> None:
        """Retrieves and calculate VM inventory summary by influx catalog data."""

        LOGGER.info(
            "> computing inventory information (not from catalog, means not only backup data is calculated)")

        # ########## Part 1: Check if something needs to be computed #############
        # query the timestamp of the last vm; 'commited' is queried since influx always requires at least one field.
        vms_table = self.__influx_client.database["vms"]

        time_query = SelectionQuery(
            keyword=Keyword.SELECT,
            tables=[vms_table],
            fields=['time', 'commited'],
            limit=1,
            order_direction="DESC"
        )
        result = self.__influx_client.send_selection_query(time_query) # type: ignore
        last_vm: Dict[str, Any] = next(result.get_points(), None) # type: ignore

        if(not last_vm):
            raise ValueError("no VM's stored, either none are available or you have to store vm's first")

        # query the last vm stats to compare timestamps with last vm
        last_time_ms: int = last_vm["time"]
        last_time = SppUtils.to_epoch_secs(last_time_ms)
        where_str = "time = {}s".format(last_time)

        vm_stats_table = self.__influx_client.database["vmStats"]

        vm_stats_query = SelectionQuery(
            keyword=Keyword.SELECT,
            tables=[vm_stats_table],
            fields=['*'],
            where_str=where_str,
            limit=1
        )
        result = self.__influx_client.send_selection_query(vm_stats_query) # type: ignore
        if(len(list(result.get_points())) > 0): # type: ignore
            LOGGER.info(">> vm statistics already computed, skipping")
            return

        # ####################### Part 2: Compute new Data ####################
        fields = [
            'uptime',
            'powerState',
            'commited',
            'uncommited',
            'memory',
            'host',
            'vmVersion',
            'isProtected',
            'inHLO',
            'isEncrypted',
            'datacenterName',
            'hypervisorType',
        ]
        query = SelectionQuery(
            keyword=Keyword.SELECT,
            tables=[vms_table],
            fields=fields,
            where_str=where_str
        )
        result = self.__influx_client.send_selection_query(query) # type: ignore

        all_vms_list: List[Dict[str, Union[str, int, float, bool]]] = list(result.get_points()) # type: ignore

        # skip if no new data can be computed
        if(not all_vms_list):
            raise ValueError("no VM's stored, either none are available or store vms first")

        vm_stats: Dict[str, Any] = {}
        try:
            vm_stats['vmCount'] = len(all_vms_list)

            # returns largest/smallest
            vm_stats['vmMaxSize'] = max(all_vms_list, key=(lambda mydict: mydict['commited']))['commited']
            # zero-size VMs are ignored on purpose
            vms_no_null_size = list(filter(lambda mydict: mydict['commited'] > 0, all_vms_list))
            if(vms_no_null_size):
                vm_stats['vmMinSize'] = min(vms_no_null_size, key=(lambda mydict: mydict['commited']))['commited']
            vm_stats['vmSizeTotal'] = sum(mydict['commited'] for mydict in all_vms_list)
            vm_stats['vmAvgSize'] = vm_stats['vmSizeTotal'] / vm_stats['vmCount']

            # returns largest/smallest
            vm_stats['vmMaxUptime'] = max(all_vms_list, key=(lambda mydict: mydict['uptime']))['uptime']
            # zero-uptime VMs are ignored on purpose
            vms_no_null_time = list(filter(lambda mydict: mydict['uptime'] > 0, all_vms_list))
            if(vms_no_null_time):
                vm_stats['vmMinUptime'] = min(vms_no_null_time, key=(lambda mydict: mydict['uptime']))['uptime']
            vm_stats['vmUptimeTotal'] = sum(mydict['uptime'] for mydict in all_vms_list)
            vm_stats['vmAvgUptime'] = vm_stats['vmUptimeTotal'] / vm_stats['vmCount']

            vm_stats['vmCountProtected'] = len(list(filter(lambda mydict: mydict['isProtected'] == "True", all_vms_list)))
            vm_stats['vmCountUnprotected'] = vm_stats['vmCount'] - vm_stats['vmCountProtected']
            vm_stats['vmCountEncrypted'] = len(list(filter(lambda mydict: mydict['isEncrypted'] == "True", all_vms_list)))
            vm_stats['vmCountPlain'] = vm_stats['vmCount'] - vm_stats['vmCountEncrypted']
            vm_stats['vmCountHLO'] = len(list(filter(lambda mydict: mydict['inHLO'] == "True", all_vms_list)))
            vm_stats['vmCountNotHLO'] = vm_stats['vmCount'] - vm_stats['vmCountHLO']


            vm_stats['vmCountVMware'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "vmware", all_vms_list)))
            vm_stats['vmCountHyperV'] = len(list(filter(lambda mydict: mydict['hypervisorType'] == "hyperv", all_vms_list)))


            vm_stats['nrDataCenters'] = len(set(map(lambda vm: vm['datacenterName'], all_vms_list)))
            vm_stats['nrHosts'] = len(set(map(lambda vm: vm['host'], all_vms_list)))

            vm_stats['time'] = all_vms_list[0]['time']

            if self.__verbose:
                MethodUtils.my_print([vm_stats])

        except (ZeroDivisionError, AttributeError, KeyError, ValueError) as error:
            ExceptionUtils.exception_info(error=error)
            raise ValueError("error when computing extra vm stats", vm_stats)

        LOGGER.info(">> store vmInventory information in Influx DB")
        self.__influx_client.insert_dicts_to_buffer("vmStats", [vm_stats])
    def store_script_metrics(self) -> None:
        """Stores script metrics into influxb. To be called before exit.

        Does not raise any exceptions, skips if influxdb is missing.
        """
        LOGGER.info("Storing script metrics")
        try:
            if (not self.influx_client):
                raise ValueError("no influxClient set up")
            insert_dict: Dict[str, Union[str, int, float, bool]] = {}

            # add version numbers; API calls are needed for the SPP version and build
            insert_dict["sppmon_version"] = VERSION
            if (self.rest_client):
                try:
                    (version_nr,
                     build) = self.rest_client.get_spp_version_build()
                    insert_dict["spp_version"] = version_nr
                    insert_dict["spp_build"] = build
                except ValueError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message="could not query SPP version and build.")

            # end total sppmon runtime
            end_counter = time.perf_counter()
            insert_dict['duration'] = int(
                (end_counter - self.start_counter) * 1000)

            # add arguments of sppmon
            for (key, value) in vars(OPTIONS).items():
                insert_dict[key] = value

            # save the errors that occurred
            error_count = len(ExceptionUtils.stored_errors)
            if (error_count > 0):
                ExceptionUtils.error_message(
                    f"total of {error_count} exception/s occured")
            insert_dict['errorCount'] = error_count
            # save list as str
            insert_dict['errorMessages'] = str(ExceptionUtils.stored_errors)

            # get end timestamp
            (time_key, time_val) = SppUtils.get_capture_timestamp_sec()
            insert_dict[time_key] = time_val

            # save the metrics
            self.influx_client.insert_dicts_to_buffer(
                table_name="sppmon_metrics", list_with_dicts=[insert_dict])
            self.influx_client.flush_insert_buffer()
            LOGGER.info("Stored script metrics sucessfull")
            # + 1 due to the "total of x exception/s occurred" message above
            if (error_count + 1 < len(ExceptionUtils.stored_errors)):
                ExceptionUtils.error_message(
                    "A non-critical error occured while storing script metrics. \n\
                    This error can't be saved into the DB, it's only displayed within the logs."
                )
        except ValueError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=
                "Error when storing sppmon-metrics, skipping this step. Possible insert-buffer data loss"
            )
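
Flattening the parsed command-line options into the metrics row, as done with `vars(OPTIONS)` above, can be shown with a tiny argparse example; the version string and flags are placeholders:

import argparse
import time

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true")
parser.add_argument("--ssh", action="store_true")
options = parser.parse_args(["--verbose"])

start_counter = time.perf_counter()
# ... work would happen here ...
metrics = {"sppmon_version": "0.x (placeholder)"}
metrics["duration"] = int((time.perf_counter() - start_counter) * 1000)  # runtime in ms
# every CLI flag becomes its own field, exactly like vars(OPTIONS) above
for key, value in vars(options).items():
    metrics[key] = value
print(metrics)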
    def create_dashboard(dashboard_folder_path: str,
                         database_name: str) -> None:
        """Creates from the 14 day dashboard a new dashboard for the individual database.
        Alerts are transferred

        Args:
            dashboard_folder_path (str): Path to the folder where the template is located
            database_name (str): name of the database

        Raises:
            ValueError: no path given
            ValueError: no db name given
            ValueError: error when reading or writing files
        """
        if (not dashboard_folder_path):
            raise ValueError(
                "a path to the dashboard template is required to create a new dashboard"
            )
        if (not database_name):
            raise ValueError(
                "need the name of the database to create the new dashboard")

        real_path = os.path.realpath(dashboard_folder_path)
        tmpl_path = os.path.join(real_path,
                                 "SPPMON for IBM Spectrum Protect Plus.json")

        LOGGER.info(f"> trying to open template dashboard on path {tmpl_path}")

        try:
            tmpl_file = open(tmpl_path, "rt")
            file_str = tmpl_file.read()
            tmpl_file.close()
        except Exception as error:
            ExceptionUtils.exception_info(error)
            raise ValueError(
                "Error opening dashboard template. Make sure you've the path to the correct folder (Grafana)."
            )
        LOGGER.info("> Sucessfully opened. Creating new Dashboard")
        # replace name by new one
        name_str = file_str.replace(
            "\"title\": \"SPPMON for IBM Spectrum Protect Plus\"",
            f"\"title\": \"SPPMON for IBM Spectrum Protect Plus {database_name}\""
        )

        # replace uid by new one
        uid_str = re.sub("\"uid\": \".*\"",
                         f"\"uid\": \"14_day_auto_gen_{database_name}\"",
                         name_str)

        # replace all datasource = null by actual datasource
        datasource_str = uid_str.replace(
            "\"datasource\": null",
            f"\"datasource\": \"{database_name}\"",
        )

        LOGGER.info("> finished creating content of dashboard")
        write_path = os.path.join(
            real_path,
            f"SPPMON for IBM Spectrum Protect Plus {database_name}.json")
        LOGGER.info(f"> trying to create dashboard file on path {write_path}")
        try:
            dashboard_file = open(write_path, "wt")
            dashboard_file.write(datasource_str)
            dashboard_file.close()
        except Exception as error:
            ExceptionUtils.exception_info(error)
            raise ValueError("Error creating new dashboard file.")
        LOGGER.info("> Sucessfully created new dashboard file.")
Example #20
    def get_vms_per_sla(self) -> List[Dict[str, Any]]:
        """retrieves and calculates all vmware per SLA."""

        endpoint = "/ngp/slapolicy"
        allow_list = ["name", "id"]
        array_name = "slapolicies"

        sla_policy_list = self.__rest_client.get_objects(
            endpoint=endpoint,
            allow_list=allow_list,
            array_name=array_name,
            add_time_stamp=False
        )

        result_list: List[Dict[str, Any]] = []
        for sla_policy in sla_policy_list:
            try:
                sla_name: str = sla_policty["name"]
            except KeyError as error:
                ExceptionUtils.exception_info(error, extra_message="skipping one sla entry due missing name.")
                continue
            sla_id: Optional[str] = sla_policty.get("id", None)

            result_dict: Dict[str, Any] = {}

            ## hotadd:
            sla_name = urllib.parse.quote_plus(sla_name)

            endpoint = "/api/hypervisor/search"
            params = {
                "resourceType": "vm",
                "from": "hlo",
                "pageSize": 1,
                "filter": json.dumps([
                    {
                        "property": "storageProfileName",
                        "value": sla_name,
                        "op": "="
                    }
                ])
            }
            # other options: volume, vm, tag, tagcategory
            post_data = {
                "name": "*",
                "hypervisorType": "vmware",
            }

            (response_json, _) = self.__rest_client.query_url(
                self.__rest_client.get_url(endpoint),
                params,
                RequestType.POST,
                post_data)

            result_dict["slaName"] = sla_name
            result_dict["slaId"] = sla_id
            result_dict["vmCountBySLA"] = response_json.get("total", None)

            time_key, time = SppUtils.get_capture_timestamp_sec()
            result_dict[time_key] = time

            result_list.append(result_dict)

        return result_list
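
For reference, the shape of the search filter built above and of one returned row, with placeholder SLA values:

import json

sla_name = "Gold"  # placeholder SLA policy name
params = {
    "resourceType": "vm",
    "from": "hlo",
    "pageSize": 1,
    "filter": json.dumps([
        {"property": "storageProfileName", "value": sla_name, "op": "="}
    ]),
}
print(params["filter"])

# each SLA policy yields one row of this shape (values are illustrative):
example_row = {"slaName": "Gold", "slaId": "2101", "vmCountBySLA": 42, "time": 1621345678}
print(example_row)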