def _parse_df_cmd(ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `df` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")
    if (not ssh_command.table_name):
        raise ValueError("need table name to insert parsed value")

    result_lines = ssh_command.result.splitlines()
    header = result_lines[0].split()
    # remove the trailing "on" of the "Mounted on" column header
    header.pop()

    values: List[Dict[str, Any]] = list(
        map(lambda row: dict(zip(header, row.split())), result_lines[1:]))  # type: ignore

    for row in values:
        if ("1G-blocks" in row):
            row["Size"] = row.pop("1G-blocks")
            row["Size"] = SppUtils.parse_unit(row['Size'])
        if ("Avail" in row):
            row["Available"] = row.pop("Avail")
            row["Available"] = SppUtils.parse_unit(row['Available'])
        row["Used"] = SppUtils.parse_unit(row['Used'])
        row["Use%"] = row["Use%"][:-1]

        # set default needed fields
        row['hostName'] = ssh_command.host_name
        row['ssh_type'] = ssh_type.name
        (time_key, time_value) = SppUtils.get_capture_timestamp_sec()
        row[time_key] = time_value

    return (ssh_command.table_name, values)
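# --- Illustrative sketch (editor's addition, not part of the original code) ---
# Shape of `df` output that `_parse_df_cmd` assumes, with invented values.
# The "Mounted on" header is two tokens, which is why the trailing "on" is
# popped before zipping header and row tokens:
#
#   Filesystem     1G-blocks  Used Avail Use% Mounted on
#   /dev/sda2            50G   21G   29G  42% /
#
# A row like the one above would roughly become:
#   {"Filesystem": "/dev/sda2", "Size": <parsed 50G>, "Used": <parsed 21G>,
#    "Available": <parsed 29G>, "Use%": "42", "Mounted": "/",
#    "hostName": ..., "ssh_type": ..., <capture timestamp key>: ...}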
def _parse_ps_cmd(self, ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `ps` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")
    if (not ssh_command.table_name):
        raise ValueError("need table name to insert parsed value")

    result_lines = ssh_command.result.splitlines()
    header = result_lines[0].split()
    values: List[Dict[str, Any]] = list(
        map(lambda row: dict(zip(header, row.split())), result_lines[1:]))  # type: ignore

    # keep only processes from the grep list; this also drops `ps` itself, which is otherwise tracked too
    values = list(filter(lambda row: row["COMMAND"] in self.__process_grep_list, values))
    for row in values:
        # Remove CPU, it is tracked by the TOP command (see Issue #71)
        row.pop("%CPU", None)

        # Add information
        row["collectionType"] = "PS"

        # set default needed fields
        row['hostName'] = ssh_command.host_name
        row['ssh_type'] = ssh_type.name
        (time_key, time_value) = SppUtils.get_capture_timestamp_sec()
        row[time_key] = time_value

        row['TIME+'] = row.pop('ELAPSED')
        row['MEM_ABS'] = SppUtils.parse_unit(row.pop("RSS"), "kib")
        row['VIRT'] = SppUtils.parse_unit(row.pop('VSZ'), "kib")

    return (ssh_command.table_name, values)
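# --- Illustrative sketch (editor's addition, not part of the original code) ---
# `_parse_ps_cmd` assumes a `ps` header containing at least the columns used
# above, e.g. (values invented):
#
#   COMMAND  %CPU     RSS     VSZ      ELAPSED
#   mongod    1.2  812344 2101248  10-02:33:12
#
# RSS and VSZ are interpreted as KiB via `SppUtils.parse_unit(..., "kib")`,
# ELAPSED is stored as-is under the key `TIME+`, and %CPU is dropped because
# it is already captured by the `top` parser.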
def transform_time_literal(
        value: str, single_vals: bool = False) -> Union[str, Tuple[int, int, int]]:
    """Transforms a time literal into an hour/min/seconds literal.

    Checks beforehand whether the literal is valid.

    Args:
        value (str): time literal to be transformed
        single_vals (bool, optional): whether the result should be a tuple. Defaults to False.

    Raises:
        ValueError: no value given
        ValueError: not a str given
        ValueError: value is no time literal

    Returns:
        Union[str, Tuple[int, int, int]]: influxdb time literal in 0h0m0s format or values as tuple
    """
    if (not value):
        raise ValueError("need a value to verify the time literal")
    if (not isinstance(value, str)):
        raise ValueError("type of the value for time literal transform is not str")
    if (not re.match(r"^(\d+(?:[smhdw]))+$", value)):
        if (value.lower() == "inf"):
            return "0s"
        raise ValueError("value does not pass the time literal check", value)

    match_list = re.findall(r"((\d+)([a-z]+))", value)
    time_s = 0
    for (_, numbers, unit) in match_list:  # full match is first, but unused
        time_s += SppUtils.parse_unit(numbers, unit)

    hours = int(time_s / pow(60, 2))
    time_s = time_s % pow(60, 2)
    mins = int(time_s / pow(60, 1))
    seconds = int(time_s % pow(60, 1))
    if (single_vals):
        return (hours, mins, seconds)
    return f"{hours}h{mins}m{seconds}s"
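# --- Worked example (editor's addition, assuming `SppUtils.parse_unit` converts
# "1"+"d" to 86400 s, "12"+"h" to 43200 s and "30"+"m" to 1800 s) ---
# transform_time_literal("1d12h30m")                   -> "36h30m0s"
# transform_time_literal("1d12h30m", single_vals=True) -> (36, 30, 0)
# transform_time_literal("inf")                        -> "0s"
# transform_time_literal("1 day")                      -> ValueError (fails the literal check)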
class JobMethods:
    """Wrapper for all job related functionality. You may implement new methods in here.

    Methods:
        get_all_jobs - incrementally saves all stored jobsessions, even before first execution of sppmon.
        job_logs -> saves all jobLogs for the jobsessions in influx catalog.
    """

    # only here to maintain for later, unused yet
    __job_log_allow_list = [
        "CTGGA2340", "CTGGA0071", "CTGGA2260", "CTGGA2315", "CTGGA0550",
        "CTGGA2384"
    ]
    # to be moved somewhere else

    # ######### Add new logs to be parsed here #######################################
    # Structure:
    # Dict with messageID of the log as name.
    # Value is a tuple of:
    #   #1 the tablename
    #   #2 a lambda which maps each elem to a name. Must contain at least one argument!
    #   #3 list of keys of additional information to be saved: either a plain key or a
    #      (new name, source key) tuple when renaming is needed.
    # The values are delivered by the param_list of the joblog.
    # If the value is something like 10sec or 10gb use `parse_unit` to parse it.
    __supported_ids: Dict[str, Tuple[str, Callable[[List[Any]], Dict[str, Any]], List[Union[Tuple[str, str], str]]]] = {
        'CTGGA2384': (
            'vmBackupSummary',
            lambda params: {
                "name": params[0],
                "proxy": params[1],
                "vsnaps": params[2],
                "type": params[3],
                "transportType": params[4],
                "transferredBytes": SppUtils.parse_unit(params[5]),
                "throughputBytes/s": SppUtils.parse_unit(params[6]),
                "queueTimeSec": SppUtils.parse_unit(params[7]),
                "protectedVMDKs": params[8],
                "TotalVMDKs": params[9],
                "status": params[10]
            },
            ["messageId"]  # Additional information from the job-message itself
        ),
        'CTGGA0071': (
            'vmBackupSummary',
            lambda params: {
                'protectedVMDKs': params[0],
                'TotalVMDKs': int(params[1]) + int(params[0]),
                'transferredBytes': SppUtils.parse_unit(params[2]),
                'throughputBytes/s': SppUtils.parse_unit(params[3]),
                'queueTimeSec': SppUtils.parse_unit(params[4])
            },
            ["messageId"]),
        'CTGGA0072': (
            'vmReplicateSummary',
            lambda params: {
                'total': params[0],
                'failed': params[1],
                'duration': SppUtils.parse_unit(params[2])
            },
            []),
        'CTGGA0398': (
            'vmReplicateStats',
            lambda params: {
                'replicatedBytes': SppUtils.parse_unit(params[0]),
                'throughputBytes/sec': SppUtils.parse_unit(params[1]),
                'duration': SppUtils.parse_unit(params[2], delimiter=':')
            },
            []),
        'CTGGR0003': (
            'office365Stats',
            lambda params: {
                'imported365Users': int(params[0]),
            },
            [
                # Additional information from the job-message itself, including rename
                "jobId", "jobSessionId", "jobName",
                "jobExecutionTime"  # used to instantly integrate with other stats
            ]),
        'CTGGA2444': (
            'office365Stats',
            lambda params: {
                'protectedItems': int(params[0]),
                'selectedItems': int(params[0]),
            },
            [
                "jobId", "jobSessionId", "jobName",
                "jobExecutionTime"  # used to instantly integrate with other stats
            ]),
        'CTGGA2402': (
            'office365TransfBytes',
            # If not matching, this will return an empty dict which is then ignored
            lambda params: MethodUtils.joblogs_parse_params(
                r"(\w+)\s*\(Server:\s*([^\s,]+), Transfer Size: (\d+(?:.\d*)?\s*\w*)\)",
                params[1],
                lambda match_list: {
                    "itemName": params[0],
                    "itemType": match_list[1],
                    "serverName": match_list[2],
                    "transferredBytes": SppUtils.parse_unit(match_list[3]),
                }),
            ["jobId", "jobSessionId", "jobName"]),
    }
    """JobLog messageIDs which can be parsed by sppmon. Check the detailed summary above the declaration."""

    def __init__(self, influx_client: Optional[InfluxClient],
                 api_queries: Optional[ApiQueries],
                 job_log_retention_time: str, job_log_types: List[str],
                 verbose: bool):
        if (not influx_client):
            raise ValueError("Job Methods are not available, missing influx_client")
        if (not api_queries):
            raise ValueError("Job Methods are not available, missing api_queries")

        self.__influx_client = influx_client
        self.__api_queries = api_queries
        self.__verbose = verbose

        self.__job_log_retention_time = job_log_retention_time
        """used to limit the time jobLogs are queried, only interesting for the init call"""

        self.__job_log_types = job_log_types

    def get_all_jobs(self) -> None:
        """incrementally saves all stored jobsessions, even before first execution of sppmon"""

        job_list = MethodUtils.query_something(
            name="job list", source_func=self.__api_queries.get_job_list)

        for job in job_list:
            job_id = job.get("id", None)
            job_name = job.get("name", None)

            # this way to make sure we also catch empty strings
            if (not job_id or not job_name):
                ExceptionUtils.error_message(f"skipping, missing name or id for job {job}")
                continue
            LOGGER.info(">> capturing Job information for Job \"{}\"".format(job_name))

            try:
                self.__job_by_id(job_id=job_id)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"error when getting jobs for {job_name}, skipping it")
                continue

    def __job_by_id(self, job_id: str) -> None:
        """Requests and saves all jobsessions for a jobID"""
        if (not job_id):
            raise ValueError("need job_id to request jobs for that ID")

        keyword = Keyword.SELECT
        table = self.__influx_client.database['jobs']

        query = SelectionQuery(
            keyword=keyword,
            fields=['id', 'jobName'],
            tables=[table],
            where_str=f'jobId = \'{job_id}\' AND time > now() - {table.retention_policy.duration}'
            # unnecessary filter?
        )
        LOGGER.debug(query)

        result = self.__influx_client.send_selection_query(query)  # type: ignore
        id_list: List[int] = []
        row: Dict[str, Any] = {}  # make sure the var exists
        for row in result.get_points():  # type: ignore
            id_list.append(row['id'])  # type: ignore

        if (not row):
            LOGGER.info(f">>> no entries in Influx database found for job with id {job_id}")

        # calculate time to be requested
        (rp_hours, rp_mins, rp_secs) = InfluxUtils.transform_time_literal(
            table.retention_policy.duration, single_vals=True)
        max_request_timestamp = datetime.datetime.now() - datetime.timedelta(
            hours=float(rp_hours),
            minutes=float(rp_mins),
            seconds=float(rp_secs))
        unixtime = int(time.mktime(max_request_timestamp.timetuple()))
        # make it ms instead of s
        unixtime *= 1000

        # retrieve all jobs in this category from the REST API, filtered to avoid drops due to the RP
        LOGGER.debug(f">>> requesting job sessions for id {job_id}")
        all_jobs = self.__api_queries.get_jobs_by_id(job_id=job_id)

        # filter out all jobs whose start time is not later than the retention time limit
        latest_jobs = list(filter(lambda job: job['start'] > unixtime, all_jobs))

        missing_jobs = list(
            filter(lambda job_api: int(job_api['id']) not in id_list, latest_jobs))

        if (len(missing_jobs) > 0):
            LOGGER.info(f">>> {len(missing_jobs)} datasets missing in DB for jobId: {job_id}")

            # Removes `statistics` from jobs
            self.__compute_extra_job_stats(missing_jobs, job_id)

            LOGGER.info(f">>> inserting job information of {len(missing_jobs)} jobs into jobs table")
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=missing_jobs, table_name="jobs")
        else:
            LOGGER.info(f">>> no new jobs to insert into DB for job with ID {job_id}")

        # TODO: artifact from older versions, not replaced yet
        if self.__verbose:
            display_number_of_jobs = 5
            keyword = Keyword.SELECT
            table = self.__influx_client.database['jobs']
            where_str = 'jobId = \'{}\''.format(job_id)
            query = SelectionQuery(
                keyword=keyword,
                fields=['*'],
                tables=[table],
                where_str=where_str,
                order_direction='DESC',
                limit=display_number_of_jobs)
            result = self.__influx_client.send_selection_query(query)  # type: ignore
            result_list: List[str] = list(result.get_points())  # type: ignore

            job_list_to_print: List[str] = []
            for row_str in result_list:
                job_list_to_print.append(row_str)
            print()
            print("displaying last {} jobs for job with ID {} from database (as available)"
                  .format(display_number_of_jobs, job_id))
            MethodUtils.my_print(data=job_list_to_print)

    def __compute_extra_job_stats(self, list_with_jobs: List[Dict[str, Any]],
                                  job_id: str) -> None:
        """Extracts the additional `statistics` list from jobs and removes it from the original list.

        Computes an additional table out of the data.

        Args:
            list_with_jobs (List[Dict[str, Any]]): list with all jobs
        """
        LOGGER.info(f">>> computing additional job statistics for jobId: {job_id}")

        insert_list: List[Dict[str, Any]] = []
        # check for None instead of a bool-check: removes empty statistic lists [].
        for job in filter(lambda x: x.get("statistics", None) is not None, list_with_jobs):
            job_statistics_list = job.pop('statistics')

            for job_stats in job_statistics_list:
                try:
                    insert_dict: Dict[str, Any] = {}

                    # fields
                    insert_dict['resourceType'] = job_stats.get('resourceType', None)
                    insert_dict['total'] = job_stats.get('total', 0)
                    insert_dict['success'] = job_stats.get('success', 0)
                    insert_dict['failed'] = job_stats.get('failed', 0)

                    skipped = job_stats.get('skipped', None)
                    if (skipped is None):
                        skipped = insert_dict["total"] - insert_dict["success"] - insert_dict["failed"]
                    insert_dict["skipped"] = skipped

                    # time key
                    insert_dict['start'] = job['start']

                    # regular tag values for grouping:
                    insert_dict['id'] = job.get('id', None)
                    insert_dict['jobId'] = job.get('jobId', None)
                    insert_dict['status'] = job.get('status', None)
                    insert_dict['indexStatus'] = job.get('indexStatus', None)
                    insert_dict['jobName'] = job.get('jobName', None)
                    insert_dict['type'] = job.get('type', None)
                    insert_dict['subPolicyType'] = job.get('subPolicyType', None)

                    insert_list.append(insert_dict)
                except KeyError as error:
                    ExceptionUtils.exception_info(
                        error=error,
                        extra_message=f"failed to compute job-individual statistics due to a key error. Report to developer. Job: {job} ; job_stats: {job_stats}")

        if (len(insert_list) > 0):
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=insert_list, table_name="jobs_statistics")
        else:
            LOGGER.info(f">>> no additional job statistics to insert into DB for jobId: {job_id}")

    def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None:
        """Parses joblogs into their own statistic table, using the declared supported IDs.

        To parse more jobLogs, define additional entries in the attribute `supported_ids`.

        Arguments:
            list_with_logs {List[Dict[str, Any]]} -- List with all saved joblogs
        """
        # only continue with joblogs we want to save
        supported_log_iterator = filter(
            lambda log: log['messageId'] in self.__supported_ids.keys(),
            list_with_logs)
        sorted_log_iterator = sorted(supported_log_iterator,
                                     key=lambda entry: entry['logTime'])
        max_sec_timestamp = 0  # required for preventing duplicates

        for job_log in sorted_log_iterator:
            message_id = job_log['messageId']

            table_func_triple = self.__supported_ids[message_id]
            (table_name, row_dict_func, additional_fields) = table_func_triple

            if (not table_name):
                table_name = message_id
                ExceptionUtils.error_message(
                    f"Warning: No tablename specified for message_id {message_id}. Please report to developer.")

            try:
                # Saving information from the message-params list within the job_log
                row_dict = row_dict_func(job_log['messageParams'])
                if (not row_dict):
                    # this was matched incorrectly, therefore skipped.
                    # No warning because this will happen often.
                    continue
                # Saving additional fields from the job_log struct itself.
                if (additional_fields):
                    for value in additional_fields:
                        if (isinstance(value, Tuple)):
                            # with rename
                            row_dict[value[0]] = job_log[value[1]]
                        else:
                            # without rename
                            row_dict[value] = job_log[value]
            except (KeyError, IndexError) as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=f"MessageID params wrongly defined. Skipping message_id {message_id} with content: {job_log}")
                continue

            # Issue 9: in a case where all tag values duplicate another record, including the
            # timestamp, Influx will throw the insert out as a duplicate. In some cases, the change
            # of epoch timestamps from millisecond to second precision causes duplicate timestamps.
            # To avoid this for certain tables, add seconds to the timestamp as needed to ensure
            # uniqueness. Only use this when some inaccuracy of the timestamps is acceptable.
            cur_timestamp = job_log['logTime']
            if (table_name == 'vmBackupSummary'):

                if (cur_timestamp is None):  # prevent None
                    ExceptionUtils.error_message(
                        f"Warning: logTime is None, duplicate may be purged. Log: {job_log}")

                if (isinstance(cur_timestamp, str)):  # make sure it is int
                    cur_timestamp = int(cur_timestamp)

                cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp)
                if (cur_sec_timestamp <= max_sec_timestamp):
                    digits = (int)(cur_timestamp / cur_sec_timestamp)
                    max_sec_timestamp += 1  # increase by 1 second
                    cur_timestamp = max_sec_timestamp * digits
                else:
                    max_sec_timestamp = cur_sec_timestamp

            row_dict['time'] = cur_timestamp

            for (key, item) in row_dict.items():
                if (item in ('null', 'null(null)')):
                    row_dict[key] = None

            self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])

    def job_logs(self) -> None:
        """saves all jobLogs for the jobsessions in influx catalog.

        Make sure to call `get_all_jobs` beforehand to acquire all jobsessions.
        In order to save them it deletes and rewrites all affected jobsession entries.
        It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
        """
        # total count of requested logs
        logs_requested_total = 0
        # total count of inserted logs
        logs_to_stats_total = 0  # should be equal, but on failure it is not (skipped logs)

        # list to be inserted after everything is updated
        job_update_list: List[Dict[str, Any]] = []

        LOGGER.info("> Requesting jobs with missing logs from influx database")

        table = self.__influx_client.database['jobs']
        # only store if there is something to store -> limited by job log retention time.
        where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
        where_str += f' AND time > now() - {table.retention_policy.duration}'

        # Select all jobs without joblogs
        keyword = Keyword.SELECT
        query = SelectionQuery(
            keyword=keyword,
            tables=[table],
            fields=['*'],
            where_str=where_str)

        # send query and compute
        missing_logs_jobs_rs = self.__influx_client.send_selection_query(query)  # type: ignore

        # this list contains all jobs which are missing their logs
        # Cast from resultset into list
        missing_logs_jobs: List[Dict[str, Any]] = list(
            missing_logs_jobs_rs.get_points())  # type: ignore

        LOGGER.info(f">>> Number of jobs with no joblogs stored in Influx database: {len(missing_logs_jobs)}")

        LOGGER.info("> Requesting missing jobLogs from REST-API.")
        # request all jobLogs from the REST-API
        # counter only for display purposes
        for counter, row in enumerate(missing_logs_jobs, 0):
            # Only print every 5 rows if not verbose
            # starts at 0, therefore already updated
            if (self.__verbose or counter % 5 == 0):
                LOGGER.info(f">>> computed joblogs for {counter} / {len(missing_logs_jobs)} job sessions.")

            job_session_id: Optional[int] = row.get('id', None)

            # if somehow the jobLogId is missing: skip
            # Should usually not happen
            if (job_session_id is None):
                ExceptionUtils.error_message(f"Error: jobSessionId missing for row {row}")
                continue

            if (self.__verbose):
                LOGGER.info(f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}.")
            LOGGER.debug(f">>> Requesting jobLogs {self.__job_log_types} for session {job_session_id}.")

            try:
                # can't use `query_something` like in other places due to the extra params:
                # api_queries.query_something only works with no params

                # This list contains all joblogs for a single job-execution
                current_job_logs = self.__api_queries.get_job_log_details(
                    jobsession_id=job_session_id,
                    job_logs_types=self.__job_log_types,
                    request_ids=list(self.__supported_ids.keys()))
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"Error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
                continue

            job_log_count = len(current_job_logs)
            logs_requested_total += job_log_count

            if (self.__verbose):
                LOGGER.info(f">>> Found {job_log_count} logs for jobsessionId {job_session_id}")
            LOGGER.debug(f"Found {job_log_count} logs for jobsessionId {job_session_id}")

            # ####################################################################################
            # Compute results and save logs
            # ####################################################################################
            # The request of REST-API logs is finished here.
            # To not crash by saving 100.000+ logs, directly compute results and insert them.
            # ####################################################################################

            for job_log in current_job_logs:
                # add additional information from the job-session itself
                job_log["jobId"] = row.get("jobId", None)
                job_log["jobName"] = row.get("jobName", None)
                job_log["jobExecutionTime"] = row.get("start", None)

                # rename for clarity
                job_log["jobLogId"] = job_log.pop("id", None)
                job_log["jobSessionId"] = job_log.pop("jobsessionId", None)

            # ##########################################################
            # compute jobLog-Stats into each associated table
            # ##########################################################
            try:
                self.__job_logs_to_stats(current_job_logs)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error,
                    extra_message=f"Failed to parse jobLogs into their own table, skipping for jobsessionId {job_session_id}")

            logs_to_stats_total += job_log_count

            # ##########################################################
            # save logs within the joblog-dump
            # ##########################################################

            # Only dump them after computing stats since they are read within the stats computation
            for job_log in current_job_logs:
                # dump message params to allow saving as string
                job_log["messageParams"] = json.dumps(job_log["messageParams"])

            # if the list is empty, e.g. because it was erased, this will simply return and do nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=current_job_logs, table_name="jobLogs")

            # shallow copy the dict to allow an update without errors
            copied_jobsession = dict(row.items())

            # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
            update_fields = {
                "jobLogsCount": job_log_count,
                "jobsLogsStored": True
            }
            # update the fields
            for (key, value) in update_fields.items():
                copied_jobsession[key] = value
            job_update_list.append(copied_jobsession)

            # ##########################################################
            # End of For-Each
            # ##########################################################

        # ##########################################################
        # Delete each job, then re-insert
        # ##########################################################

        # Delete all jobs which got requested, no matter if failed
        delete_query = SelectionQuery(
            keyword=Keyword.DELETE,
            tables=[table],
            where_str=where_str)

        # now send the remove query to prevent data loss
        self.__influx_client.send_selection_query(delete_query)  # type: ignore

        # Insert data after everything is completed
        self.__influx_client.insert_dicts_to_buffer(table.name, job_update_list)

        if (logs_requested_total != logs_to_stats_total):
            LOGGER.info(
                f"> Requested a total of {logs_requested_total} logs but only computed {logs_to_stats_total} into sppmon statistics")
        else:
            LOGGER.info(f">>> requested and computed a total of {logs_requested_total} logs")

        LOGGER.info(f">> Updated a total of {len(job_update_list)} jobs")
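    # --- Extension sketch (editor's addition, not part of the original class) ---
    # A new jobLog parser would be registered in `__supported_ids` following the
    # structure documented above: messageId -> (table name, params-lambda,
    # additional job-log fields). The messageId, table name and indices below are
    # hypothetical and only illustrate the shape of an entry:
    #
    #   'CTGGA9999': (
    #       'myNewStatsTable',                        # hypothetical table name
    #       lambda params: {
    #           'itemCount': int(params[0]),
    #           'transferredBytes': SppUtils.parse_unit(params[1]),
    #       },
    #       ["jobId", ("sessionId", "jobSessionId")]  # plain key or (new name, source key) tuple
    #   ),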
def _parse_pool_show_cmd(
        ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `vsnap --json pool show` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")

    pool_result_list: List[Dict[str, Any]] = []

    try:
        result: Dict[str, List[Dict[str, Any]]] = json.loads(ssh_command.result)
    except json.decoder.JSONDecodeError:  # type: ignore
        raise ValueError("cant decode json for pool command",
                         ssh_command.result, ssh_command, ssh_type)

    for pool in result['pools']:
        pool_dict: Dict[str, Any] = {}

        # acts as white list
        insert_list = [
            'compression', 'compression_ratio', 'deduplication',
            'deduplication_ratio', 'diskgroup_size', 'encryption.enabled',
            'health', 'id', 'name', 'pool_type', 'size_before_compression',
            'size_before_deduplication', 'size_free', 'size_total',
            'size_used', 'status'
        ]
        for item in insert_list:
            (key, value) = SppUtils.get_nested_kv(item, pool)
            pool_dict[key] = value

        # rename
        pool_dict['encryption_enabled'] = pool_dict.pop('enabled')

        # change unit from bytes to megabytes
        try:
            sz_b_c = SppUtils.parse_unit(pool_dict['size_before_compression'])
            sz_b_d = SppUtils.parse_unit(pool_dict['size_before_deduplication'])
            sz_fr = SppUtils.parse_unit(pool_dict['size_free'])
            sz_t = SppUtils.parse_unit(pool_dict['size_total'])
            sz_u = SppUtils.parse_unit(pool_dict['size_used'])

            pool_dict['size_before_compression'] = int(sz_b_c / pow(2, 20)) if sz_b_c else None
            pool_dict['size_before_deduplication'] = int(sz_b_d / pow(2, 20)) if sz_b_d else None
            pool_dict['size_free'] = int(sz_fr / pow(2, 20)) if sz_fr else None
            pool_dict['size_total'] = int(sz_t / pow(2, 20)) if sz_t else None
            pool_dict['size_used'] = int(sz_u / pow(2, 20)) if sz_u else None
        except KeyError as error:
            ExceptionUtils.exception_info(
                error=error,
                extra_message=f"failed to reduce size of vsnap pool size for {pool_dict}")

        # set default needed fields
        pool_dict['hostName'] = ssh_command.host_name
        pool_dict['ssh_type'] = ssh_type.name

        pool_result_list.append(pool_dict)

    return (ssh_command.table_name, pool_result_list)
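# --- Illustrative sketch (editor's addition, values invented) ---
# `vsnap --json pool show` is expected to return JSON of roughly this shape;
# only the keys in the `insert_list` above are picked out of each pool entry:
#
#   {"pools": [{"id": 1, "name": "primary", "pool_type": "raid0",
#               "health": "normal", "status": "ONLINE",
#               "compression": true, "compression_ratio": 1.4,
#               "deduplication": true, "deduplication_ratio": 2.1,
#               "diskgroup_size": 0, "encryption": {"enabled": false},
#               "size_total": 1099511627776, "size_used": 219902325555,
#               "size_free": 879609302221, "size_before_compression": 307863255859,
#               "size_before_deduplication": 646513029939}]}
#
# Note how the nested "encryption.enabled" key is flattened via
# `SppUtils.get_nested_kv` and then renamed to "encryption_enabled".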
def _parse_top_cmd(ssh_command: SshCommand, ssh_type: SshTypes) -> Tuple[str, List[Dict[str, Any]]]:
    """Parses the result of the `top` command, splitting it into its parts.

    Arguments:
        ssh_command {SshCommand} -- command with saved result
        ssh_type {SshTypes} -- type of the client

    Raises:
        ValueError: no command given or no result saved
        ValueError: no ssh type given

    Returns:
        Tuple[str, List[Dict[str, Any]]] -- Tuple of the table name and an insert list
    """
    if (not ssh_command or not ssh_command.result):
        raise ValueError("no command given or empty result")
    if (not ssh_type):
        raise ValueError("no sshtype given")

    result_lines = ssh_command.result.splitlines()
    header = result_lines[6].split()
    values: List[Dict[str, Any]] = list(
        map(lambda row: dict(zip(header, row.split())), result_lines[7:]))  # type: ignore

    ram_line = result_lines[3].split()
    total_mem = SppUtils.parse_unit(data=ram_line[3], given_unit="KiB")

    time_pattern = re.compile(r"(\d+):(\d{2})(?:\.(\d{2}))?")

    # remove the top statistic itself to avoid spam with useless information
    values = list(
        filter(lambda row: row["COMMAND"] in ["mongod", "beam.smp", "java"], values))

    for row in values:
        # set default needed fields
        row['hostName'] = ssh_command.host_name
        row['ssh_type'] = ssh_type.name
        (time_key, time_value) = SppUtils.get_capture_timestamp_sec()
        row[time_key] = time_value

        # split time into seconds
        match = re.match(time_pattern, row['TIME+'])
        if (match):
            time_list = match.groups()
            (hours, minutes, seconds) = time_list
            if (seconds is None):
                seconds = 0
            time = int(hours) * pow(60, 2) + int(minutes) * pow(60, 1) + int(seconds) * pow(60, 0)
        else:
            time = None
        row['TIME+'] = time

        row['MEM_ABS'] = int((float(row['%MEM']) * total_mem) / 100)
        row['SHR'] = SppUtils.parse_unit(row['SHR'])
        row['RES'] = SppUtils.parse_unit(row['RES'])
        row['VIRT'] = SppUtils.parse_unit(row['VIRT'])

    return (ssh_command.table_name, values)
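# --- Illustrative note (editor's addition) ---
# Per the compiled pattern, a TIME+ value such as "7:36.18" matches with
# groups ("7", "36", "18"), which the loop above turns into
# 7*3600 + 36*60 + 18 = 27378 seconds; a value without the fractional part,
# e.g. "7:36", yields 7*3600 + 36*60 + 0 = 27360.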
class JobMethods:
    """Wrapper for all job related functionality. You may implement new methods in here.

    Methods:
        get_all_jobs - incrementally saves all stored jobsessions, even before first execution of sppmon.
        job_logs -> saves all jobLogs for the jobsessions in influx catalog.
    """

    # only here to maintain for later, unused yet
    __job_log_white_list = [
        "CTGGA2340", "CTGGA0071", "CTGGA2260", "CTGGA2315", "CTGGA0550",
        "CTGGA2384"
    ]
    # to be moved somewhere else

    # ######### Add new logs to be parsed here #######################################
    # Structure:
    # Dict with messageID of the log as name.
    # Value is a tuple of:
    #   #1 the tablename
    #   #2 a lambda which maps each elem to a name
    # The values are delivered by the param_list of the joblog.
    # If the value is something like 10sec or 10gb use `parse_unit` to parse it.
    __supported_ids: Dict[str, Tuple[str, Callable[[List[Any]], Dict[str, Any]]]] = {
        'CTGGA2384': ('vmBackupSummary',
                      lambda params: {
                          "name": params[0],
                          "proxy": params[1],
                          "vsnaps": params[2],
                          "type": params[3],
                          "transportType": params[4],
                          "transferredBytes": SppUtils.parse_unit(params[5]),
                          "throughputBytes/s": SppUtils.parse_unit(params[6]),
                          "queueTimeSec": SppUtils.parse_unit(params[7]),
                          "protectedVMDKs": params[8],
                          "TotalVMDKs": params[9],
                          "status": params[10]
                      }),
        'CTGGA0071': ('vmBackupSummary',
                      lambda params: {
                          'protectedVMDKs': params[0],
                          'TotalVMDKs': int(params[1]) + int(params[0]),
                          'transferredBytes': SppUtils.parse_unit(params[2]),
                          'throughputBytes/s': SppUtils.parse_unit(params[3]),
                          'queueTimeSec': SppUtils.parse_unit(params[4])
                      }),
        'CTGGA0072': ('vmReplicateSummary',
                      lambda params: {
                          'total': params[0],
                          'failed': params[1],
                          'duration': SppUtils.parse_unit(params[2])
                      }),
        'CTGGA0398': ('vmReplicateStats',
                      lambda params: {
                          'replicatedBytes': SppUtils.parse_unit(params[0]),
                          'throughputBytes/sec': SppUtils.parse_unit(params[1]),
                          'duration': SppUtils.parse_unit(params[2], delimiter=':')
                      })
    }
    """JobLog messageIDs which can be parsed by sppmon. Check the detailed summary above the declaration."""

    def __init__(self, influx_client: Optional[InfluxClient], api_queries: Optional[ApiQueries],
                 job_log_retention_time: str, job_log_type: str, verbose: bool):
        if (not influx_client):
            raise ValueError("Job Methods are not available, missing influx_client")
        if (not api_queries):
            raise ValueError("Job Methods are not available, missing api_queries")

        self.__influx_client = influx_client
        self.__api_queries = api_queries
        self.__verbose = verbose

        self.__job_log_retention_time = job_log_retention_time
        """used to limit the time jobLogs are queried, only interesting for the init call"""

        self.__job_log_type = job_log_type

    def get_all_jobs(self) -> None:
        """incrementally saves all stored jobsessions, even before first execution of sppmon"""

        job_list = MethodUtils.query_something(
            name="job list",
            source_func=self.__api_queries.get_job_list
        )
        for job in job_list:
            job_id = job.get("id", None)
            job_name = job.get("name", None)

            # this way to make sure we also catch empty strings
            if (not job_id or not job_name):
                ExceptionUtils.error_message(f"skipping, missing name or id for job {job}")
                continue
            LOGGER.info(">> capturing Job information for Job \"{}\"".format(job_name))

            try:
                self.__job_by_id(job_id=job_id)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error, extra_message=f"error when getting jobs for {job_name}, skipping it")
                continue

    def __job_by_id(self, job_id: str) -> None:
        """Requests and saves all jobsessions for a jobID"""
        if (not job_id):
            raise ValueError("need job_id to request jobs for that ID")

        keyword = Keyword.SELECT
        table = self.__influx_client.database['jobs']

        query = SelectionQuery(
            keyword=keyword,
            fields=['id', 'jobName'],
            tables=[table],
            where_str=f'jobId = \'{job_id}\' AND time > now() - {table.retention_policy.duration}'
            # unnecessary filter?
        )
        LOGGER.debug(query)

        result = self.__influx_client.send_selection_query(query)  # type: ignore
        id_list: List[int] = []
        row: Dict[str, Any] = {}  # make sure the var exists
        for row in result.get_points():  # type: ignore
            id_list.append(row['id'])  # type: ignore

        if (not row):
            LOGGER.info(
                f">>> no entries in Influx database found for job with id {job_id}")

        # calculate time to be requested
        (rp_hours, rp_mins, rp_secs) = InfluxUtils.transform_time_literal(
            table.retention_policy.duration, single_vals=True)
        max_request_timestamp = datetime.datetime.now() - datetime.timedelta(
            hours=float(rp_hours),
            minutes=float(rp_mins),
            seconds=float(rp_secs)
        )
        unixtime = int(time.mktime(max_request_timestamp.timetuple()))
        # make it ms instead of s
        unixtime *= 1000

        # retrieve all jobs in this category from the REST API, filtered to avoid drops due to the RP
        LOGGER.debug(f">>> requesting job sessions for id {job_id}")
        all_saved_jobs = self.__api_queries.get_jobs_by_id(
            job_id=job_id, timestamp_min=unixtime
        )

        missing_jobs = list(filter(lambda job_api: int(job_api['id']) not in id_list, all_saved_jobs))

        if (len(missing_jobs) > 0):
            LOGGER.info(f">>> {len(missing_jobs)} datasets missing in DB for jobId: {job_id}")
            LOGGER.info(f">>> inserting job information of {len(missing_jobs)} jobs into jobs table")
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=missing_jobs, table_name="jobs")
        else:
            LOGGER.info(
                f">>> no new jobs to insert into DB for job with ID {job_id}")

        # TODO: artifact from older versions, not replaced yet
        if self.__verbose:
            display_number_of_jobs = 5
            keyword = Keyword.SELECT
            table = self.__influx_client.database['jobs']
            where_str = 'jobId = \'{}\''.format(job_id)
            query = SelectionQuery(
                keyword=keyword,
                fields=['*'],
                tables=[table],
                where_str=where_str,
                order_direction='DESC',
                limit=display_number_of_jobs
            )
            result = self.__influx_client.send_selection_query(query)  # type: ignore
            result_list: List[Dict[str, Any]] = list(result.get_points())  # type: ignore

            job_list_to_print: List[str] = []
            for row_str in result_list:
                job_list_to_print.append(row_str)
            print()
            print("displaying last {} jobs for job with ID {} from database (as available)".format(
                display_number_of_jobs, job_id))
            MethodUtils.my_print(data=job_list_to_print)

    def __job_logs_to_stats(self, list_with_logs: List[Dict[str, Any]]) -> None:
        """Parses joblogs into their own statistic table, using the declared supported IDs.

        To parse more jobLogs, define additional entries in the attribute `supported_ids`.

        Arguments:
            list_with_logs {List[Dict[str, Any]]} -- List with all saved joblogs
        """
        # only continue with joblogs we want to save
        supported_log_iterator = filter(lambda log: log['messageId'] in self.__supported_ids.keys(), list_with_logs)
        sorted_log_iterator = sorted(supported_log_iterator, key=lambda entry: entry['logTime'])
        max_sec_timestamp = 0  # required for preventing duplicates

        for job_log in sorted_log_iterator:
            message_id = job_log['messageId']
            table_func_tuple = self.__supported_ids[message_id]
            (table_name, row_dict_func) = table_func_tuple

            if (not table_name):
                table_name = message_id

            try:
                row_dict = row_dict_func(job_log['messageParams'])
            except KeyError as error:
                ExceptionUtils.exception_info(
                    error, extra_message="MessageID params wrongly defined. Skipping one MessageId")
                continue

            row_dict['messageId'] = message_id

            # Issue 9: in a case where all tag values duplicate another record, including the
            # timestamp, Influx will throw the insert out as a duplicate. In some cases, the change
            # of epoch timestamps from millisecond to second precision causes duplicate timestamps.
            # To avoid this for certain tables, add seconds to the timestamp as needed to ensure
            # uniqueness. Only use this when some inaccuracy of the timestamps is acceptable.
            cur_timestamp = job_log['logTime']
            if (table_name == 'vmBackupSummary'):

                if (cur_timestamp is None):  # prevent None
                    ExceptionUtils.error_message(f"Warning: logTime is None, duplicate may be purged. Log: {job_log}")

                if (isinstance(cur_timestamp, str)):  # make sure it is int
                    cur_timestamp = int(cur_timestamp)

                cur_sec_timestamp = SppUtils.to_epoch_secs(cur_timestamp)
                if (cur_sec_timestamp <= max_sec_timestamp):
                    digits = (int)(cur_timestamp / cur_sec_timestamp)
                    max_sec_timestamp += 1  # increase by 1 second
                    cur_timestamp = max_sec_timestamp * digits
                else:
                    max_sec_timestamp = cur_sec_timestamp

            row_dict['time'] = cur_timestamp

            for (key, item) in row_dict.items():
                if (item in ('null', 'null(null)')):
                    row_dict[key] = None

            self.__influx_client.insert_dicts_to_buffer(table_name, [row_dict])

    def job_logs(self) -> None:
        """saves all jobLogs for the jobsessions in influx catalog.

        Make sure to call `get_all_jobs` beforehand to acquire all jobsessions.
        In order to save them it deletes and rewrites all affected jobsession entries.
        It automatically parses certain jobLogs into additional stats, defined by `supported_ids`.
        """
        table = self.__influx_client.database['jobs']
        # only store if there is something to store -> limited by job log retention time.
        where_str = 'jobsLogsStored <> \'True\' and time > now() - %s' % self.__job_log_retention_time
        where_str += f' AND time > now() - {table.retention_policy.duration}'

        jobs_updated = 0
        logs_total_count = 0
        LOGGER.info("> getting joblogs for jobsessions without saved logs")
        LOGGER.info(">> requesting jobList from database")

        # Select all jobs without joblogs
        keyword = Keyword.SELECT
        query = SelectionQuery(
            keyword=keyword,
            tables=[table],
            fields=['*'],
            where_str=where_str
        )

        # send query and compute
        result = self.__influx_client.send_selection_query(query)  # type: ignore
        result_list: List[Dict[str, Any]] = list(result.get_points())  # type: ignore

        rows_affected = len(result_list)
        LOGGER.info(">>> number of jobs with no joblogs stored in Influx database: {}"
                    .format(rows_affected))

        job_log_dict: Dict[int, List[Dict[str, Any]]] = {}

        # request all jobLogs from the REST-API
        # if errors occur, skip the single row and debug
        for row in result_list:
            job_session_id: Optional[int] = row.get('id', None)

            # if somehow the id is missing: skip
            if (job_session_id is None):
                ExceptionUtils.error_message(f"Error: joblogId missing for row {row}")
                continue

            if (job_session_id in job_log_dict):
                ExceptionUtils.error_message(f"Error: joblogId duplicate, skipping.{job_session_id}")
                continue

            if (self.__verbose):
                LOGGER.info(
                    f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions.")
            elif (len(job_log_dict) % 5 == 0):
                LOGGER.info(
                    f">>> requested joblogs for {len(job_log_dict)} / {rows_affected} job sessions.")

            # request job_session_id
            try:
                if (self.__verbose):
                    LOGGER.info(f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")
                LOGGER.debug(f"requesting jobLogs {self.__job_log_type} for session {job_session_id}.")

                # can't use `query_something` like everywhere else due to the extra params needed
                job_log_list = self.__api_queries.get_job_log_details(
                    jobsession_id=job_session_id,
                    job_logs_type=self.__job_log_type)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error=error,
                    extra_message=f"error when api-requesting joblogs for job_session_id {job_session_id}, skipping it")
                continue

            if (self.__verbose):
                LOGGER.info(f">>> Found {len(job_log_list)} logs for jobsessionId {job_session_id}")
            LOGGER.debug(f"Found {len(job_log_list)} logs for jobsessionId {job_session_id}")

            # default to an empty list if no details are available -> should not happen, in here for safety reasons
            # if this is None, go down to the rest client and fix it. Should be an empty list.
            if (job_log_list is None):
                job_log_list = []
                ExceptionUtils.error_message(
                    "A joblog_list was None, even if the type does not allow it. Please report to developers.")
            job_log_dict[job_session_id] = job_log_list

        # list to be inserted after everything is updated
        insert_list: List[Dict[str, Any]] = []

        # Query data in ranges to avoid too many requests
        # Results from the first select query above
        for row in result_list:
            job_id: int = row['id']
            job_log_list: Optional[List[Dict[str, Any]]] = job_log_dict.get(job_id, None)

            if (job_log_list is None):
                ExceptionUtils.error_message(
                    f"missing job_log_list even though it is in influxdb for jobId {job_id}. Skipping it")
                continue

            # jobLogsCount will be zero if jobLogs are deleted after X days by maintenance jobs, GUI default is 60 days
            job_logs_count = len(job_log_list)

            if (self.__verbose):
                LOGGER.info(">>> storing {} joblogs for jobsessionId: {} in Influx database".format(
                    len(job_log_list), job_id))
            LOGGER.debug(">>> storing {} joblogs for jobsessionId: {} in Influx database".format(
                len(job_log_list), job_id))

            # compute other stats out of the jobList
            try:
                self.__job_logs_to_stats(job_log_list)
            except ValueError as error:
                ExceptionUtils.exception_info(
                    error, extra_message=f"Failed to compute stats out of job logs, skipping for jobsessionId {job_id}")

            for job_log in job_log_list:
                # rename key 'id' to jobLogId and reformat messageParams
                job_log["jobSessionId"] = row.get("jobId", None)
                job_log["jobSessionName"] = row.get("jobName", None)
                job_log["jobLogId"] = job_log.pop("id")
                job_log["messageParams"] = json.dumps(job_log["messageParams"])

            # if the list is empty, e.g. because it was erased, this will simply return and do nothing
            self.__influx_client.insert_dicts_to_buffer(
                list_with_dicts=job_log_list, table_name="jobLogs")

            jobs_updated += 1
            logs_total_count += job_logs_count

            # update job table and set jobsLogsStored = True, jobLogsCount = len(jobLogDetails)
            update_fields = {
                "jobLogsCount": job_logs_count,
                "jobsLogsStored": True
            }
            # copy the dict to allow an update without errors
            mydict = dict(row.items())
            # update fields
            for (key, value) in update_fields.items():
                mydict[key] = value
            insert_list.append(mydict)

        # Delete data to allow reinsert with different tags
        delete_query = SelectionQuery(
            keyword=Keyword.DELETE,
            tables=[table],
            where_str=where_str
        )
        # now send the remove query to prevent data loss
        self.__influx_client.send_selection_query(delete_query)

        # Insert data after everything is completed
        self.__influx_client.insert_dicts_to_buffer(table.name, insert_list)

        LOGGER.info(">>> inserted a total of {} logs".format(logs_total_count))