class ToolJobKeys(object):
    ''' Facilitates keyword searching against a Log Analysis Server.'''

    def __init__(self, args):
        ''' Initializes the tool. '''
        logger.debug("Begin ToolJobKeys init")

        #: The user configurable input to the usecase.
        self.settings = Settings()

        self.settings.help_string = '''A tool for determining the occurrence rate of keywords
during the run time of a job in the syslog and mmfs logs. User supplied time ranges will be
overridden by the actual time range of the specified job.'''

        # self.settings.time
        self.settings.append_class(TimeSettings)
        # self.settings.job_info
        self.settings.append_class(JobSettings)
        # self.settings.remote_server
        self.settings.append_class(RemoteServerSettings)

        # Parse the arguments and config file.
        self.settings.parse_args(args)

        #: The payload for the query (UnityPayload).
        self.payload = UnityPayload()

        #: Exposes the CSM CLI to python without needing to use subprocess.
        self.csmi_tool = CSMITool()

        # Initialize the unity connection.
        try:
            self.payload.create_connection(
                self.settings.remote_server.construct_uri(),
                self.settings.remote_server.access["userid"],
                self.settings.remote_server.access["userpass"])
        except Exception as e:
            # If the login fails, exit the system with a 1.
            logger.error(e)
            logger.error("Please verify that the remote_server section was properly configured.")
            logger.error("Error Code 1 \nSettings Used:\n{0}".format(
                self.settings.get_values_string()))

            if self.settings.remote_server.access is None:
                logger.error("The access setting MUST point to a separate configuration file.")

            sys.exit(1)

        #: Holds the jobs that were running in the specified time range (dict).
        self.job_id_map = None
        #: Overall start time for the job (datetime).
        self.job_start_time = None
        #: Overall end time for the job (datetime).
        self.job_end_time = None
        #: The allocation id, the user will generally not know this id (int).
        self.allocation_id = 0

        logger.debug("Exit ToolJobKeys init")

    def stat_keywords(self):
        ''' Determines the incidence rate of keywords and displays logs containing
        the keyword specified (if the verbose flag is set).'''
        logger.debug("Enter ToolJobKeys stat_keywords")

        # Get the time range.
        try:
            self.job_id_map, self.job_start_time, self.job_end_time = \
                unity_helper.find_job_time_range_csm(
                    self.payload,
                    self.settings.job_info.job_id,
                    self.settings.job_info.secondary_job_id,
                    self.settings.job_info.target_hostnames,
                    self.settings.time.date,
                    self.settings.time.days)
        except ValueError as e:
            logger.error("Unable to find the job time range, error was: %s", e)
            return 1

        if self.job_id_map is None or len(self.job_id_map) == 0:
            if self.settings.job_info.target_hostnames is not None:
                logger.warning(
                    "No errors were detected, but jobid '{0}' was not found for targeted hosts: [{1}]. Please consult your settings and try again.".format(
                        self.settings.job_info.job_id,
                        ",".join(self.settings.job_info.target_hostnames)))
            else:
                logger.warning(
                    "No errors were detected, but jobid '{0}' was not found. Please consult your settings and try again.".format(
                        self.settings.job_info.job_id))

            logger.debug("Exit ToolJobKeys stat_keywords")
            return 1

        if self.settings.default.verbose:
            unity_helper.print_job_time_range(self.job_id_map,
                                              self.settings.job_info.job_id,
                                              self.job_start_time,
                                              self.job_end_time)

        # Cache the number of hosts.
        num_hosts = len(self.job_id_map)

        logger.debug("Building ToolJobKeys stat_keywords query")

        self.payload.initialize()
        self.payload.set_logsources("logSource", ["/syslog", "/mmfs"])
        self.payload.set_getAttributes(["timestamp", "syslogHostname", "message"])
        self.payload.set_range_timestamp_filter(self.job_start_time, self.job_end_time)
        self.payload.set_term_facet("syslogHostname", num_hosts)

        # Build the repeated query string.
        query_string = " AND " + self.payload.scala_host_query('syslogHostname', self.job_id_map)

        # Zero the keyword counts.
        for node in self.job_id_map:
            self.job_id_map[node]["keyword_counts"] = [0] * len(self.settings.job_info.keywords)

        logger.debug("stat_keywords query built: %s", self.payload.get_json_payload())

        # Cache the baseline variables for the server communication loop.
        default_start = self.payload.get_start()
        keyword_index = 0

        # Map to store the verbose results for formatting properly.
        if self.settings.default.verbose:
            verbose_map = dict()

        logger.debug("Begin BDS communication")

        # Execute the search for each keyword.
        for keyword in self.settings.job_info.keywords:
            logger.debug("Gathering statistics about '%s' keyword", keyword)

            # Finalize the query.
            self.payload.set_query("\"" + keyword + "\"" + query_string)

            while self.payload.get_start() >= 0:
                logger.debug("Executing stat_keywords '%s' keyword query", keyword)

                # Execute the actual query.
                json_response = json.loads(self.payload.post())

                # If the total results count is found in the response and the results
                # exceed the results returned, increment the start point for the next iteration.
                # Else set the start to -1 so the execution knows not to iterate.
                self.payload.set_start(
                    self.payload.determine_start(json_response,
                                                 self.payload.get_start(),
                                                 self.payload.get_results()))

                # TODO Should facetResults et al. be moved to Constants?
                # Get the statistics for each host.
                if 'facetResults' in json_response and \
                        'term_facet' in json_response["facetResults"] and \
                        'counts' in json_response["facetResults"]['term_facet']:
                    logger.debug("Counts for the '%s' keyword were found", keyword)

                    for count in json_response["facetResults"]['term_facet']['counts']:
                        self.job_id_map[count['term']]['keyword_counts'][keyword_index] = \
                            count['count']

                # XXX Maybe this should be output to a file?
                # If the verbose option is set, cache the messages for output.
                if self.settings.default.verbose and "searchResults" in json_response:
                    logger.debug("Search results for the '%s' keyword were" +
                                 " found, gathering for verbose output", keyword)

                    verbose_map[keyword] = dict()

                    for entry in json_response["searchResults"]:
                        attributes = entry.get("attributes")
                        if attributes is None:
                            continue

                        hostname = attributes["syslogHostname"]
                        if hostname not in verbose_map[keyword]:
                            verbose_map[keyword][hostname] = []

                        if 'mmfsEventDescription' in attributes:
                            message = attributes['mmfsEventDescription']
                        elif "message" in attributes:
                            message = attributes["message"]

                        # TODO should this timestamp be formatted?
                        if "timestamp" in attributes:
                            message = attributes["timestamp"] + ": " + message

                        verbose_map[keyword][hostname].append(message)
                    ''' End for loop '''
                ''' End while loop '''

            logger.debug("Done gathering statistics about '%s' keyword", keyword)

            # Update the loop sentinels.
            keyword_index += 1
            self.payload.set_start(default_start)
        ''' End for loop '''

        logger.debug("End BDS communication")
        logger.debug("Keyword statistics gathering complete, outputting results")

        # Pretty print.
        print("\nSearched from \n{0} to {1}".format(
            output_helpers.format_timestamp(self.job_start_time),
            output_helpers.format_timestamp(self.job_end_time)))

        print("\n{0} \nKeyword Statistics\n{1}".format(DIV_0, DIV_1))

        # Print the results to the console.
        for node in self.job_id_map:
            key_index = 0
            keyword_counts = " "

            for keyword in self.job_id_map[node]['keyword_counts']:
                keyword_counts += self.settings.job_info.keywords[key_index] + \
                    "=" + unicode(keyword) + " "
                key_index += 1

            print(node + keyword_counts)

        print(DIV_0 + "\n")

        if self.settings.default.verbose:
            for keyword in verbose_map:
                print("\n{0}\nContains Keyword: {1}\n{2}\n".format(DIV_2, keyword, DIV_2))

                # TODO this might need to be tokenized.
                for host in verbose_map[keyword]:
                    print("\n" + DIV_2 + "\n" + host + ":\n" + DIV_2 + "\n")

                    for message in verbose_map[keyword][host]:
                        print(message)

                print(DIV_1)

        logger.debug("Exit ToolJobKeys stat_keywords")
        return 0
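# --------------------------------------------------------------------------------
# Hedged usage sketch (an assumption, not part of the shipped tool): assuming this
# module is importable and that an argv-style list reaches Settings.parse_args, a
# minimal driver for ToolJobKeys could look like the function below. The return
# value mirrors the 0/1 codes produced by stat_keywords().
def _run_job_keys_sketch(argv):
    ''' Illustrative only: build the keyword tool from argv and return its exit code. '''
    tool = ToolJobKeys(argv)      # Parses args/config and connects to Log Analysis.
    return tool.stat_keywords()   # 0 on success, 1 if the job could not be found.
# --------------------------------------------------------------------------------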
class ToolJobsRunning(object):
    ''' Contains the functions necessary to check for jobs running at a specified
    time on a remote LogAnalysis server. '''

    def __init__(self, args):
        ''' Initializes the tool. '''
        logger.debug("Begin ToolJobsRunning init")

        #: The user configurable input to the usecase.
        self.settings = Settings()

        self.settings.help_string = '''A tool for retrieving the jobs that were running during the
specified time. The -v or --verbose flag will query the csm apis to aggregate more information
about the jobs that were found to be running.'''

        # self.settings.time
        self.settings.append_class(TimeSettings)
        # self.settings.job_info
        self.settings.append_class(JobSettings, ["target_hostnames", "no_hosts"])
        # self.settings.remote_server
        self.settings.append_class(RemoteServerSettings)

        # Parse the arguments and config file.
        self.settings.parse_args(args)

        #: The Log Analysis payload for this tool (UnityPayload).
        self.payload = UnityPayload()

        #: Exposes the CSM CLI to python without needing to use subprocess.
        self.csmi_tool = csmi_tool.CSMITool()

        # Initialize the unity connection.
        try:
            self.payload.create_connection(
                self.settings.remote_server.construct_uri(),
                self.settings.remote_server.access["userid"],
                self.settings.remote_server.access["userpass"])
        except Exception as e:
            # If the login fails, exit the system with a 1.
            logger.error(e)
            logger.error("Please verify that the remote_server section was properly configured.")
            logger.error("Error Code 1 \nSettings Used:\n{0}".format(
                self.settings.get_values_string()))

            if self.settings.remote_server.access is None:
                logger.error("The access setting MUST point to a separate configuration file.")

            sys.exit(1)

        #: Tracks the jobs that were running in the specified time range (dict).
        self.job_alloc_map = None

        logger.debug("End ToolJobsRunning init")

    def find_jobs(self):
        ''' Finds the jobs running at the specified time. If the verbose flag was
        specified a secondary query will be run.

        Please consult jobs_running_during for details regarding the query against
        the big data store.

        :returns int: Return Code '''
        rc = self.jobs_running_during()

        if self.settings.default.verbose:
            self.verify_jobs_running()

        self.print_jobs_running()

        return rc

    def jobs_running_during(self):
        ''' Finds the jobs running at the specified time. Results are output to the console.

        The current iteration of the query works as follows:

        | Key:
        |   a - "--targettime"
        |   b - "--targettime" - "--days"
        |   - - "--days"
        |   | - start/end of scan
        |   ~ - Unexamined days/time
        |
        |        b       a
        |    ~~~|<-------|~~~~~~~~~
        |
        | The query first establishes a time range to search for allocation creations/deletions.
        | Using the supplied "--targettime" and "--days", the start and end times are computed,
        |   with the end time being the target time. Using this time range and any nodes specified,
        |   a filter is generated to reduce the number of records returned.
        | A query is then sent to the Log Analysis server and the results are parsed.
        |
        | IF create is found
        |   The status of the job (a Boolean) is combined with True on that hostname.
        | ELSE IF delete is found
        |   The status of the job (a Boolean) is combined with False on that hostname.
        |
        | IF the status of the job cached locally is True
        |   The hostname had that job/allocation running at the target time.
        |
        | Some notes:
        | 1. If the job was created before the search window and ends after it, it will not be detected.
        |    a. "--days" should be viewed as a heuristic value of sorts (e.g. the maximum run time).
        | 2. If the job was created, but not properly destroyed, this will create a false positive!'''
        logger.debug("Enter ToolJobsRunning jobs_running_during")

        # Set the range of time to search in. If no time is specified, use now for the time.
        if self.settings.time.date:
            self.end_time = self.settings.time.date
        else:
            self.end_time = datetime.now()

        self.start_time = self.end_time - relativedelta(days=self.settings.time.days)

        # Build the REST POST payload.
        self.payload.initialize()
        self.payload.set_logsources("logSource", "/syslog")
        self.payload.set_getAttributes(["syslogHostname", "message", "timestamp"])
        self.payload.set_range_timestamp_filter(self.start_time, self.end_time)
        self.payload.set_query(unity_helper.BDS_ALLOCATION_QUERY)

        # Build the query string.
        if self.settings.job_info.target_hostnames is not None:
            self.payload.set_query(
                self.payload.scala_host_query(
                    "syslogHostname", self.settings.job_info.target_hostnames), "AND")

        # Reset the allocation mapping.
        self.job_alloc_map = dict()

        logger.debug("Enter ToolJobsRunning UnityConnection")

        while self.payload.get_start() >= 0:
            # Execute the actual query.
            json_response = json.loads(self.payload.post())

            logger.info("Processing results: %d->%d out of %s",
                        self.payload.get_start(),
                        self.payload.get_start() + self.payload.get_results(),
                        json_response.get("totalResults"))

            # Set up the start.
            self.payload.set_start(
                self.payload.determine_start(json_response,
                                             self.payload.get_start(),
                                             self.payload.get_results()))

            # If an error was found, report it and move on.
            if "result" in json_response and \
                    json_response["result"].get("status") == "failure":
                logger.error("Error occurred in communication: %s",
                             json_response["result"].get("message"))
                continue

            # If the search results were found in the response payload, we can process the data.
            if UnityResponse.SEARCH_RESULTS in json_response:
                # Iterate over the search.
                for entry in json_response[UnityResponse.SEARCH_RESULTS]:
                    attributes = entry[UnityResponse.ATTRIBUTES]

                    # Cache the reused results.
                    search_result = re.search(
                        unity_helper.BDS_ALLOCATION_EXTRACTOR, attributes["message"])
                    hostname = attributes["syslogHostname"]
                    timestamp = attributes["timestamp"]

                    if search_result is None:
                        logger.debug("Message didn't have allocation details.")
                        continue

                    alloc_id, al_type, job_id, sec_id = search_result.group(1, 2, 3, 4)

                    if alloc_id not in self.job_alloc_map:
                        # If the allocation id hasn't been found yet, create an object for it.
                        self.job_alloc_map[alloc_id] = {
                            "job_id": job_id,
                            "sec_id": sec_id,
                            "hostnames": {},
                            "active": 0
                        }

                    if hostname not in self.job_alloc_map[alloc_id]["hostnames"]:
                        # If the hostname is not present, add it and assume it is running.
                        self.job_alloc_map[alloc_id]["hostnames"][hostname] = True

                    if al_type == unity_helper.BDS_ALLOCATION_BEGIN_KEY:
                        # The begin was found.
                        self.job_alloc_map[alloc_id]["hostnames"][hostname] = True and \
                            self.job_alloc_map[alloc_id]["hostnames"][hostname]
                        self.job_alloc_map[alloc_id]["active"] += 1
                    elif al_type == unity_helper.BDS_ALLOCATION_END_KEY:
                        # The end was found.
                        self.job_alloc_map[alloc_id]["hostnames"][hostname] = False and \
                            self.job_alloc_map[alloc_id]["hostnames"][hostname]
                        self.job_alloc_map[alloc_id]["active"] -= 1

        logger.debug("Exit ToolJobsRunning UnityConnection")
        logger.debug("Exit ToolJobsRunning jobs_running_during")

        # Clear out the inactive allocations.
        inactive_allocations = []
        for alloc_id in self.job_alloc_map:
            if self.job_alloc_map[alloc_id]["active"] <= 0:
                inactive_allocations.append(alloc_id)

        for allocation in inactive_allocations:
            del self.job_alloc_map[allocation]

        return 0

    def verify_jobs_running(self):
        ''' Verify that the job was running at the time stamp specified using csmi api queries.
        Determines any other nodes that participated in the job.'''
        logger.debug("Enter ToolJobsRunning verify_jobs_running")

        if self.job_alloc_map is None:
            logger.debug("Exit ToolJobsRunning verify_jobs_running. No jobs to verify")
            return 1

        tz_local = tz.tzlocal()

        for alloc_id in self.job_alloc_map:
            # Initialize the new metadata.
            allocation_map = self.job_alloc_map[alloc_id]
            allocation_map["in_db"] = False
            allocation_map["verified"] = False
            allocation_map["run_time"] = "Not Found"
            allocation_map["start_time"] = "Not Found"
            allocation_map["end_time"] = "Not Found"
            allocation_map["other_nodes"] = []

            # Query the csm database; if it fails, continue to the next allocation.
            try:
                allocation_dict = self.csmi_tool.allocation_query(alloc_id)

                if allocation_dict is None:
                    logger.debug("Allocation %s was not found in the csm database", alloc_id)
                    continue

                allocation_map["in_db"] = True
            except Exception as e:
                logger.error("Allocation %s was not found in the csm database, error: %s",
                             alloc_id, e)
                continue

            # Determine the start time.
            job_start_time = allocation_dict.get(csmi_tool.CSM_AL_BEGIN_TIME)

            # Set the end time, reverting to now if no end time is found.
            history_dict = allocation_dict.get(csmi_tool.CSM_AL_HISTORY)
            if history_dict is not None:
                job_end_time = history_dict.get(csmi_tool.CSM_AL_END_TIME)
            else:
                job_end_time = None

            # Set the end time to now and check whether the tzinfo is None.
            if job_end_time is None:
                job_end_time = datetime.now()

            # The timestamp is assumed to be local time if it's not set.
            if job_end_time.tzinfo is None:
                job_end_time = job_end_time.replace(tzinfo=tz_local)

            # Add time metadata to the allocation map.
            if job_start_time is not None:
                # The timestamp is assumed to be local time if it's not set.
                if self.end_time.tzinfo is None:
                    self.end_time = self.end_time.replace(tzinfo=tz_local)

                # The timestamp is assumed to be local time if it's not set.
                if job_start_time.tzinfo is None:
                    job_start_time = job_start_time.replace(tzinfo=tz_local)

                allocation_map["verified"] = \
                    self.end_time >= job_start_time and \
                    self.end_time <= job_end_time
                allocation_map["run_time"] = job_end_time - job_start_time
                allocation_map["start_time"] = job_start_time
                allocation_map["end_time"] = job_end_time

            # Determine additional nodes that participated.
            found_hostnames = allocation_dict.get(csmi_tool.CSM_AL_COMPUTE_NODES)
            if found_hostnames is not None and \
                    "hostnames" in self.job_alloc_map[alloc_id]:
                allocation_hostnames = self.job_alloc_map[alloc_id]["hostnames"]

                for hostname in found_hostnames:
                    if hostname not in allocation_hostnames:
                        allocation_map["other_nodes"].append(hostname)

        return 0

    def print_jobs_running(self):
        ''' Print the jobs that were found to be running.
        Isolates the output from the business logic.'''
        logger.debug("Enter ToolJobsRunning print_jobs_running")

        print("")
        print(DIV_0)
        print("The following jobs were active on the following nodes at " +
              output_helpers.format_timestamp(self.end_time))
        print("AllocationID | JobID | SecondaryID | Active Hostnames")
        print(DIV_0)

        if self.job_alloc_map is None:
            logger.debug("Exit ToolJobsRunning print_jobs_running")
            return

        line_limit = len(DIV_1)
        line_count = 0
        tab_count = 5
        tab = " " * tab_count

        # Print the results to the console.
        for alloc_id in self.job_alloc_map:
            active_hosts = 0

            output = alloc_id + " | " + self.job_alloc_map[alloc_id]["job_id"] + " | " + \
                self.job_alloc_map[alloc_id]["sec_id"] + " | "
            line_count = len(output)

            for host in self.job_alloc_map[alloc_id]["hostnames"]:
                # TODO simplify code.
                if self.job_alloc_map[alloc_id]["hostnames"][host]:
                    temp_count = (len(host) + 2)
                    line_count += temp_count

                    if line_count > line_limit:
                        output += "\n" + tab
                        line_count = temp_count + tab_count

                    output += host + ", "

            print(output[:output.rfind(', ')])

            if self.settings.default.verbose:
                print(DIV_2)
                print("Found in Database: %s" %
                      self.job_alloc_map[alloc_id].get("in_db"))
                print("Time Verified    : %s" %
                      self.job_alloc_map[alloc_id].get("verified"))
                print("Running Time     : %s" %
                      self.job_alloc_map[alloc_id].get("run_time"))
                print("Start Time       : %s" %
                      output_helpers.format_timestamp(
                          self.job_alloc_map[alloc_id].get("start_time")))
                print("End Time         : %s" %
                      output_helpers.format_timestamp(
                          self.job_alloc_map[alloc_id].get("end_time")))

                others = self.job_alloc_map[alloc_id].get("other_nodes")
                if others is not None:
                    print("Additional Nodes : %s" % ", ".join(others))

            print(DIV_1)

        print("")
        print(DIV_0)

        logger.debug("Exit ToolJobsRunning print_jobs_running")
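# --------------------------------------------------------------------------------
# Hedged usage sketch (an assumption, not shipped with the tool): find_jobs() drives
# the whole ToolJobsRunning flow, so a minimal caller only needs to forward argv with
# the "--targettime"/"--days" style input described in the docstring above.
def _run_jobs_running_sketch(argv):
    ''' Illustrative only: report jobs active at the target time and return the rc. '''
    tool = ToolJobsRunning(argv)   # Parses args/config and connects to Log Analysis.
    return tool.find_jobs()        # Queries, optionally verifies via CSM, then prints.
# --------------------------------------------------------------------------------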
class ToolJobRange(object):
    ''' Class collecting the Job Time Range utility. '''

    def __init__(self, args):
        ''' Initializes the tool. '''
        logger.debug("Begin ToolJobRange init")

        #: The user configurable input to the usecase.
        self.settings = Settings()

        self.settings.help_string = '''A tool for retrieving the start and end time of a job
in the LogAnalysis Big Data Store.'''

        # self.settings.time
        self.settings.append_class(TimeSettings)
        # self.settings.job_info
        self.settings.append_class(
            JobSettings,
            ["target_hostnames", "no_hosts", "job_id", "secondary_job_id"])
        # self.settings.remote_server
        self.settings.append_class(RemoteServerSettings)

        # Parse the arguments and config file.
        self.settings.parse_args(args)

        #: The Log Analysis payload for this tool (UnityPayload).
        self.payload = UnityPayload()

        #: Exposes the CSM CLI to python without needing to use subprocess.
        self.csmi_tool = CSMITool()

        # Initialize the unity connection.
        try:
            self.payload.create_connection(
                self.settings.remote_server.construct_uri(),
                self.settings.remote_server.access["userid"],
                self.settings.remote_server.access["userpass"])
        except Exception as e:
            # If the login fails, exit the system with a 1.
            logger.error(e)
            logger.error("Please verify that the remote_server section was properly configured.")
            logger.error("Error Code 1 \nSettings Used:\n{0}".format(
                self.settings.get_values_string()))

            if self.settings.remote_server.access is None:
                logger.error("The access setting MUST point to a separate configuration file.")

            sys.exit(1)

        logger.debug("Exit ToolJobRange init")

    def find_job_time_range(self):
        ''' Find the time range of the specified job id.'''
        logger.debug("Enter ToolJobRange find_job_time_range")

        # Get the time range and exit if a value error was thrown,
        # logging the error to the error log.
        try:
            job_id_map, start_time, end_time = \
                unity_helper.find_job_time_range_csm(
                    self.payload,
                    self.settings.job_info.job_id,
                    self.settings.job_info.secondary_job_id,
                    self.settings.job_info.target_hostnames,
                    self.settings.time.date,
                    self.settings.time.days)
        except ValueError as e:
            logger.error(e)
            logger.debug("Exit ToolJobRange find_job_time_range")
            return 1

        if job_id_map is None or len(job_id_map) == 0:
            if self.settings.job_info.target_hostnames is not None:
                logger.warning(
                    "No errors were detected, but jobid '{0}' was not found for targeted hosts: [{1}]. Please consult your settings and try again.".format(
                        self.settings.job_info.job_id,
                        ",".join(self.settings.job_info.target_hostnames)))
            else:
                logger.warning(
                    "No errors were detected, but jobid '{0}' was not found. Please consult your settings and try again.".format(
                        self.settings.job_info.job_id))

            logger.debug("Exit ToolJobRange find_job_time_range")
            return 1

        print("")
        rc = unity_helper.print_job_time_range(
            job_id_map, self.settings.job_info.job_id, start_time, end_time)

        logger.debug("Exit ToolJobRange find_job_time_range")
        return rc
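# --------------------------------------------------------------------------------
# Hedged usage sketch (an assumption, not part of the original file): the time-range
# lookup is a single call once the tool has been constructed from argv.
def _run_job_range_sketch(argv):
    ''' Illustrative only: print the start/end time of a job. '''
    tool = ToolJobRange(argv)           # Needs job_id (and optionally secondary_job_id).
    return tool.find_job_time_range()   # 1 if the job was not found, otherwise the rc
                                        # from unity_helper.print_job_time_range().
# --------------------------------------------------------------------------------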
class ToolJobMetrics(object):
    ''' A tool for aggregating metrics from nodes that participated in the specified
    job id over the job execution time.'''

    def __init__(self, args):
        ''' Initializes the metrics tool.'''
        logger.debug("Enter ToolJobMetrics init")

        #: The user configurable input to the usecase.
        self.settings = Settings()

        self.settings.help_string = '''A tool for retrieving metrics from nodes that participated
in the supplied job id. Uses the JobSettings, RemoteServerSettings and StatisticalSettings
configuration modules.'''

        # self.settings.time
        self.settings.append_class(TimeSettings)
        # self.settings.job_info
        self.settings.append_class(
            JobSettings,
            ["no_hosts", "target_hostnames", "job_id", "secondary_job_id"])
        # self.settings.remote_server
        self.settings.append_class(RemoteServerSettings)
        # self.settings.statistics
        self.settings.append_class(StatisticalSettings)

        # Parse the arguments and config file.
        self.settings.parse_args(args)

        #: The payload object for connecting to the Log Analysis server (UnityPayload).
        self.payload = UnityPayload()

        #: Exposes the CSM CLI to python without needing to use subprocess.
        self.csmi_tool = CSMITool()

        try:
            self.payload.create_connection(
                self.settings.remote_server.construct_uri(),
                self.settings.remote_server.access["userid"],
                self.settings.remote_server.access["userpass"])
        except Exception as e:
            # If the login fails, exit the system with a 1.
            logger.error(e)
            logger.error("Please verify that the remote_server section was properly configured.")
            logger.error("Error Code 1 \nSettings Used:\n{0}".format(
                self.settings.get_values_string()))

            if self.settings.remote_server.access is None:
                logger.error("The access setting MUST point to a separate configuration file.")

            sys.exit(1)

        logger.debug("Exit ToolJobMetrics init")

    def get_job_metrics(self):
        ''' Gets the metrics from the Log Analysis server during the job execution
        on the specified nodes.'''
        logger.debug("Enter ToolJobMetrics get_job_metrics")

        # Get the time range and exit if a value error was thrown,
        # logging the error to the error log.
        try:
            job_id_map, start_time, end_time = \
                unity_helper.find_job_time_range_csm(
                    self.payload,
                    self.settings.job_info.job_id,
                    self.settings.job_info.secondary_job_id,
                    self.settings.job_info.target_hostnames,
                    self.settings.time.date,
                    self.settings.time.days)
        except ValueError as e:
            logger.error(e)
            return 1

        if job_id_map is None or len(job_id_map) == 0:
            if self.settings.job_info.target_hostnames is not None:
                logger.warning(
                    "No errors were detected, but jobid '{0}' was not found for targeted hosts: [{1}]. Please consult your settings and try again.".format(
                        self.settings.job_info.job_id,
                        ",".join(self.settings.job_info.target_hostnames)))
            else:
                logger.warning(
                    "No errors were detected, but jobid '{0}' was not found. Please consult your settings and try again.".format(
                        self.settings.job_info.job_id))

            logger.debug("Exit ToolJobMetrics get_job_metrics")
            return 1

        if self.settings.default.verbose:
            unity_helper.print_job_time_range(job_id_map,
                                              self.settings.job_info.job_id,
                                              start_time, end_time)

        if job_id_map is not None:
            keys = job_id_map.keys()
        else:
            keys = []

        # Get the "filtered" metrics.
        try:
            ip_addrs = self.modify_unity_payload(
                self.payload, start_time, end_time, keys,
                self.settings.statistics.log_sources,
                self.settings.statistics.log_tags)
        except ValueError as e:
            logger.error(e)
            return 1

        metrics, metadata = self.get_metrics_unity(
            self.payload,
            self.settings.statistics.log_source_details,
            keys, ip_addrs)

        self.print_metrics(metrics, metadata, start_time, end_time,
                           self.settings.job_info.job_id,
                           self.settings.statistics.stat_options)

        # Get the "unfiltered" metrics.
        ip_addrs = self.modify_unity_payload(
            self.payload, start_time, end_time,
            log_sources=self.settings.statistics.log_sources_all,
            tags=self.settings.statistics.log_tags_all)

        metrics, metadata = self.get_metrics_unity(
            self.payload, self.settings.statistics.log_source_details)

        self.print_metrics(metrics, metadata, start_time, end_time,
                           self.settings.job_info.job_id,
                           self.settings.statistics.stat_options)

        return 0

    @staticmethod
    def modify_unity_payload(payload, start_time, end_time,
                             nodes=None, log_sources=None, tags=None):
        ''' Modifies the supplied payload for a metrics query. Executes initialize_payload()
        before populating the payload for the metrics query.

        :param UnityPayload payload: A payload object with an existing connection to a Log Analysis Server.
        :param datetime start_time: The start of the range to aggregate metrics for.
        :param datetime end_time: The end of the range to aggregate metrics for.
        :param list nodes: The nodes to perform the statistical analysis on.
        :param list log_sources: A collection of Log Analysis logSource values to search in the query.
        :param list tags: A collection of Log Analysis tags to search in the query.

        :returns: A mapping of ip address to hostname.
        :rtype: dict'''
        payload.initialize()

        # Set the log sources.
        append = False
        if log_sources is not None:
            payload.set_logsources("logSource", log_sources)
            append = True

        if tags is not None:
            payload.set_logsources("tags", tags, append)

        ip_addrs = None
        if nodes is not None:
            # Get the ip address mapping.
            ip_addrs = ToolJobMetrics.map_ip(nodes)

            if ip_addrs is not None:
                payload.set_query(
                    payload.scala_host_query(None, (nodes + ip_addrs.keys())))

        # Construct the payload.
        payload.set_range_timestamp_filter(start_time, end_time)

        return ip_addrs

    @staticmethod
    def get_metrics_unity(payload, sources, nodes=None, ip_addrs=None):
        ''' Queries a LogAnalysis server and performs metrics analysis on the results.

        :param UnityPayload payload: A payload object that has been configured through
            modify_unity_payload with an active connection to a LogAnalysis server.
        :param dict sources: A dictionary containing the hostname and fields for log sources.
        :param list nodes: A collection of hostnames/ip addresses.

        :return: A dictionary of metric data and a MetricsMetadata object.
        :rtype: dict, MetricsMetadata'''
        # Initialize the metadata and metrics objects.
        metadata_objects = dict()
        metrics = {"global": {}}

        filter_on_nodes = nodes is not None
        if filter_on_nodes:
            for node in nodes:
                metrics[node] = {}

        # Query the server.
        while payload.get_start() >= 0:
            json_response = json.loads(payload.post())

            payload.set_start(payload.determine_start(json_response))

            if UnityResponse.SEARCH_RESULTS not in json_response:
                continue

            for entry in json_response[UnityResponse.SEARCH_RESULTS]:
                # Get and cache the data source for the entry.
                attributes = entry.get(UnityResponse.ATTRIBUTES)
                data_source = attributes.get(UnityResponse.DATA_SOURCE)

                if data_source not in metadata_objects:
                    keys = dict()
                    if sources is not None and \
                            data_source in sources:
                        keys = sources[data_source]

                    # Parameter expansion is great - John Dunham
                    metadata_objects[data_source] = \
                        MetricsMetadata(attributes, **keys)

                    for metric in metrics:
                        metrics[metric][data_source] = \
                            metadata_objects[data_source].build_metric_dict()

                # Resolve the hostname this entry belongs to.
                hostname = attributes.get(metadata_objects[data_source].hostname)

                if filter_on_nodes and (ip_addrs is not None and hostname in ip_addrs):
                    hostname = ip_addrs[hostname]

                # Build the metrics object for the hostname specified in the entry.
                if hostname not in metrics:
                    metrics[hostname] = {}

                if data_source not in metrics[hostname]:
                    metrics[hostname][data_source] = \
                        metadata_objects[data_source].build_metric_dict()

                try:
                    if metadata_objects[data_source].timestamp is None:
                        logger.debug("%s data source timestamp was not set", data_source)
                        continue  # FIXME give the user a choice?

                    # Resolve the timestamp.
                    timestamp = attributes.get(metadata_objects[data_source].timestamp)
                    last_index = timestamp.rfind(':')
                    timestamp = timestamp[:last_index] + "." + timestamp[last_index + 1:]
                    metric_time = dateutil.parser.parse(timestamp)

                    # Update the count.
                    metric_index = metrics[hostname][data_source]["count"]
                    metrics[hostname][data_source]["count"] += 1
                    metrics["global"][data_source]["count"] += 1

                    # Cache the timestamp.
                    raw_index = metadata_objects[data_source].get_raw_index("timestamp")
                    metrics[hostname][data_source]["raw"][raw_index].insert(
                        metric_index, metric_time)
                except Exception as e:
                    logger.warning(
                        "Error detected when caching the timestamp for this entry: %s", e)
                    continue

                for attribute in attributes:
                    if not metadata_objects[data_source].field_exists(attribute):
                        continue

                    raw_index = metadata_objects[data_source].get_raw_index(attribute)
                    if raw_index == 0:
                        continue

                    try:
                        value = float(attributes[attribute])
                    except ValueError:
                        value = 0.0

                    metrics[hostname][data_source]["raw"][raw_index].insert(
                        metric_index, value)
                    metrics[hostname][data_source]["sum"][raw_index] += value
                    metrics["global"][data_source]["sum"][raw_index] += value

                    metrics[hostname][data_source]["max"][raw_index] = \
                        max(value, metrics[hostname][data_source]["max"][raw_index])
                    metrics[hostname][data_source]["min"][raw_index] = \
                        min(value, metrics[hostname][data_source]["min"][raw_index])

        # ==================================================================================
        # Compute the total metrics.
        for data_source in metrics["global"]:
            global_count = metrics["global"][data_source]["count"]

            for raw_index in range(1, metadata_objects[data_source].num_metrics):
                metrics["global"][data_source]["avg"][raw_index] = \
                    metrics["global"][data_source]["sum"][raw_index] / max(global_count, 1)

                variance_sum = 0
                # num_records = 0
                for hostname in metrics:
                    if hostname == "global":
                        # TODO Is global std useful?
                        metrics[hostname][data_source]["std"][raw_index] = -1.0
                        continue

                    # If the data source didn't aggregate any data for that metric, continue.
                    if data_source not in metrics[hostname]:
                        continue

                    metrics["global"][data_source]["max"][raw_index] = max(
                        metrics["global"][data_source]["max"][raw_index],
                        metrics[hostname][data_source]["max"][raw_index])

                    metrics["global"][data_source]["min"][raw_index] = min(
                        metrics["global"][data_source]["min"][raw_index],
                        metrics[hostname][data_source]["min"][raw_index])

                    local_avg = metrics[hostname][data_source]["sum"][raw_index] \
                        / max(metrics[hostname][data_source]["count"], 1)

                    metrics[hostname][data_source]["avg"][raw_index] = local_avg
                    metrics[hostname][data_source]["std"][raw_index] = \
                        ToolJobMetrics.std(
                            metrics[hostname][data_source]["raw"][raw_index],
                            local_avg)

                    # For computing the average standard deviation of a field.
                    # num_records += len(metrics[hostname][data_source]["raw"][raw_index])
                    # variance_sum += pow(metrics[hostname][data_source]["std"][raw_index], 2)

                # XXX I'm not sure if this is "correct" math - John Dunham
                # metrics["global"][data_source]["std"][raw_index] = \
                #     numpy.sqrt(variance_sum / variance_sum)

        return metrics, metadata_objects

    @staticmethod
    def print_metrics(metrics, metadata, start_time=None, end_time=None,
                      job_id=None, stat_options=["avg", "min", "max", "std"]):
        ''' Prints out the aggregated metrics.

        :param dict metrics: The aggregated metrics.
        :param MetricsMetadata metadata: The metadata object for the metrics map.
        :param datetime start_time: The start time of the supplied job.
        :param datetime end_time: The end time of the supplied job.
        :param int job_id: The job id.
        :param list stat_options: Collection of metrics to display in the output. '''
        logger.debug("Enter ToolJobMetrics.print_metrics")

        # Output formatters.
        DIV_HEADER = "=" * 5 + " {0} " + "=" * 5
        DIV_0 = "-" * 50
        DIV_1 = "=" * 50
        DIV_2 = "@" * 75

        # Header
        # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        print("\n" + DIV_2)
        print("\n{0}\nDisplaying the following metrics:\n\t {1}".format(
            DIV_1, ", ".join(stat_options)))

        if job_id is not None:
            print("Job ID: {0}".format(job_id))
        if start_time is not None:
            print("Start Time: {0}".format(output_helpers.format_timestamp(start_time)))
        if end_time is not None:
            print("End Time: {0}".format(output_helpers.format_timestamp(end_time)))

        print(DIV_1)
        # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

        # Generator expression.
        headers = ",".join('{0: >12}'.format(stat) for stat in stat_options) + ", key"

        # Move the global metric to the end of the list.
        hostnames = [hostname for hostname in metrics.keys() if hostname != "global"]
        hostnames.append("global")

        # Print out the metrics.
        for hostname in hostnames:
            print("\n{0}\n{1}:\n".format(DIV_0, hostname))

            for data_source in metrics[hostname]:
                print(DIV_HEADER.format(data_source))

                body = [""] * len(metadata[data_source].headers)

                # Build the body.
                for header_index, metric in enumerate(stat_options):
                    for body_index, field in enumerate(metadata[data_source].headers):
                        stat_index = metadata[data_source].metrics.get(field)

                        try:
                            value = round(
                                metrics[hostname][data_source][metric][stat_index], 4)
                            if abs(value) == sys.float_info.max:
                                value = "NaN"
                        except:
                            value = "NaN"

                        body[body_index] += "{0: >12},".format(value)

                # Append the field name to the row.
                for body_index, field_name in enumerate(metadata[data_source].headers):
                    body[body_index] += " " + field_name

                print(headers)
                for stat in body:
                    print(stat)
                print("")

        logger.debug("Exit ToolJobMetrics.print_metrics")

    @staticmethod
    def std(values, average=None):
        ''' A passthrough to numpy.std. Performs the standard deviation on the supplied values.

        :param list values: A list of float values to perform the standard deviation on.
        :param float average: The average for the standard deviation, currently unused.

        :returns: The standard deviation for the supplied values.
        :rtype: float'''
        logger.debug("Enter unity_metrics.std")

        count = len(values)
        if count < 1 or not (isinstance(values[0], float) or isinstance(values[0], int)):
            logger.debug("Exit unity_metrics.std, standard deviation not computed")
            return

        logger.debug("Exit unity_metrics.std")
        return numpy.std(values)

    @staticmethod
    def map_ip(hostnames):
        ''' Translate a list of hostnames to a mapping of ip addresses to hostnames.

        :param list hostnames: A list of hostnames to map to ip addresses.

        :returns: A mapping of ip addresses to hostnames.
        :rtype: dict'''
        logger.debug("Entering unity_metrics.map_ip")

        ip_addrs = dict()
        for hostname in hostnames:
            # If the hostname is not in the /etc/hosts, don't crash the execution,
            # just don't add it to the list.
            try:
                ip = socket.gethostbyname(hostname)
                ip_addrs[ip] = hostname
            except socket.gaierror as e:
                logger.warning("Hostname '%s' was not found in /etc/hosts: %s", hostname, e)

        logger.debug("Exiting unity_metrics.map_ip")

        return ip_addrs
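# --------------------------------------------------------------------------------
# Hedged example (an assumption, not part of the original module): the static helpers
# on ToolJobMetrics have no server dependency, so they can be exercised directly.
# The literal inputs below are illustrative only.
def _metrics_helpers_sketch():
    ''' Illustrative only: exercise the standalone static helpers. '''
    spread = ToolJobMetrics.std([1.0, 2.0, 3.0, 4.0])   # Standard deviation via numpy.std.
    hosts = ToolJobMetrics.map_ip(["localhost"])        # ip -> hostname map via gethostbyname.
    return spread, hosts
# --------------------------------------------------------------------------------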