def log(header="", message="", send_to_remote=False):
    """Pack a "header: message" event, log it, and optionally forward it.

    The text ``header: message`` is packed with ``packer.pack_event_v1`` and
    the packed buffer is handed to ``logging.log_message``.  When
    ``send_to_remote`` is true the same buffer is also passed to
    ``remote_link.send_message``.

    Always returns True.
    """
    packed_event = packer.pack_event_v1('%s: %s' % (header, message))
    logging.log_message(packed_event)
    if not send_to_remote:
        return True
    remote_link.send_message(packed_event)
    return True
def read_loopback(devices_valid):
    '''Read loopback interfaces from FILE_LO_INTERFACES and attach them to devices.

    Every line is split on "," : the first field is used as the device
    hostname, the second and third fields are handed to LoopbackInterface.
    The interface is appended to ``interfaces_lo`` of the device in
    ``devices_valid`` that has the same hostname.

    No return value.

    Checks implemented:
        every device read belongs to devices_valid
        no duplicated endpoint
    '''
    caller = read_loopback.__name__
    log_message(caller, 'looback interfaces reading started')

    known_devices = {known.hostname: known for known in devices_valid}
    seen_devices = []
    seen_endpoints = []

    for row in read_lines_from_file(FILE_LO_INTERFACES):
        columns = row.split(',')
        device = Device(columns[0])
        seen_devices.append(device)
        loopback = LoopbackInterface(columns[1], columns[2])
        seen_endpoints.append(Endpoint(device, loopback))
        try:
            known_devices[device.hostname].interfaces_lo.append(loopback)
        except KeyError:
            # Only reported here; the subset check below stops the execution.
            print(f"{device.hostname} key not found in devices_valid")

    unknown_devices = set(seen_devices) - set(devices_valid)
    if unknown_devices:
        print(f"Invalid devices found:{unknown_devices}")
        interrupt_execution_with_errors(caller, "Invalid device read")

    if len(set(seen_endpoints)) != len(seen_endpoints):
        interrupt_execution_with_errors(caller,
                                        "Loopback Duplicated endpoint found")

    log_message(caller, 'loopback interfaces reading finished')
def finilize_config_files(devices):
    '''Append the CONFIG_FINALIZE line to the config file of every device'''
    caller = finilize_config_files.__name__
    log_message(caller, 'finalizing config started')

    # The same single-entry list is written for every device.
    closing_lines = [CONFIG_FINALIZE]
    for device in devices:
        write_device_config_to_file(device.hostname, closing_lines)

    log_message(caller, 'finalizing config completed')
def write_lines_to_file(filename, lines):
    '''Given a filename and a list, writes each list element as a line in the file.

    The file is opened in append mode ('a+'), so existing content is kept
    and the new lines are added at the end.  A newline is added after every
    element.
    '''
    log_message(write_lines_to_file.__name__, 'lines writing started')
    # The context manager closes the file even when writing fails, so the
    # handle is never leaked.
    with open(filename, 'a+') as file_handler:
        file_handler.writelines(line + '\n' for line in lines)
    log_message(write_lines_to_file.__name__, 'lines writing completed')
def generate_hostname_config(devices):
    '''Write the "set system host-name" statement for every device.

    The host-name is set to the device hostname.
    '''
    caller = generate_hostname_config.__name__
    log_message(caller, 'initializing hostname config started')

    for device in devices:
        hostname = device.hostname
        write_device_config_to_file(hostname,
                                    [f'set system host-name {hostname}'])

    log_message(caller, 'initializing hostname config finished')
def get_job_detail(job_id):
    """
    Run ``qstat -j`` for a single job.

    :param job_id: Either a job_id string, or job_id.task_array_index string
    :return: dictionary key: task_id, value: JobDetail object for the running task, or None if job is gone
    """
    try:
        tree, str_xml = run_qstat(['-j', job_id])
    except Exception as e:
        # Any qstat failure is logged and reported to the caller as None.
        # str() keeps the log call itself from raising on a non-string id.
        logging.log_message("Exception getting job detail for " + str(job_id) +
                            ": " + str(e))
        return None
    # NOTE(review): the code visible here does not use tree/str_xml and falls
    # through returning None on success as well; the dictionary promised in
    # the docstring is not built in this block - confirm nothing was lost.
def get_multiple_running_job_detail(lst_job_id):
    """
    Run ``qstat -j`` once for a comma-separated list of jobs.

    :param lst_job_id: a list of jobs to query
    :return: dictionary key: job_id, value: dictionary with key: task_id, value: JobDetail
    """
    # Built outside the try block so that the name is always bound when the
    # exception handler formats its message.
    jobs = ",".join(lst_job_id)
    try:
        tree, str_xml = run_qstat(['-j', jobs])
    except Exception as e:
        # Any qstat failure is logged and reported to the caller as None.
        logging.log_message("Exception getting job detail for " + jobs +
                            ": " + str(e))
        return None
    # NOTE(review): the code visible here does not use tree/str_xml and falls
    # through returning None on success as well; the dictionary promised in
    # the docstring is not built in this block - confirm nothing was lost.
def generate_links_config(links):
    '''Write OUTPUT_LINKS_CFG with one line per link, in the format:
    DeviceA interfaceA DeviceB interfaceB
    '''
    caller = generate_links_config.__name__
    log_message(caller, "generate links config started")

    rows = []
    for link in links:
        a_end = link.endpoint_a_end
        z_end = link.endpoint_z_end
        rows.append(f"{a_end.device.hostname} {a_end.interface.name} "
                    f"{z_end.device.hostname} {z_end.interface.name}")

    write_system_config_to_file(OUTPUT_LINKS_CFG, rows)
    log_message(caller, "generate links config finished")
def read_lines_from_file(filename):
    '''Given a filename, returns a list with all the non-empty and uncommented lines, stripped.

    Lines starting with "#" (after stripping) are treated as comments.
    The file is decoded as utf-8-sig and undecodable bytes are ignored.
    When no usable line is found, interrupt_execution_with_errors is called.
    '''
    log_message(read_lines_from_file.__name__, 'file reading started')
    # The context manager closes the file even when reading fails, so the
    # handle is never leaked.
    with open(filename, errors='ignore', encoding='utf-8-sig') as file_handler:
        stripped_lines = (line.strip() for line in file_handler)
        lines = [
            line for line in stripped_lines
            if line and not line.startswith('#')
        ]
    log_message(read_lines_from_file.__name__, 'file reading completed')
    if not lines:  # check for empty file
        interrupt_execution_with_errors(read_lines_from_file.__name__,
                                        f'{filename} is empty')
    return lines
def generate_namemap_config(devices):
    '''Write OUTPUT_NAMEMAP with one line per device, in the format:
    VMID DEVICE_HOSTNAME

    Devices are ordered by hostname.  The first VMID is VM_ID_START (set in
    constants) and each following device gets the previous VMID plus one.
    '''
    caller = generate_namemap_config.__name__
    log_message(caller, "generate namemap config started")

    ordered_devices = sorted(devices, key=lambda device: device.hostname)
    namemap_lines = [
        f"{VM_ID_START + offset} {device.hostname}"
        for offset, device in enumerate(ordered_devices)
    ]

    write_system_config_to_file(OUTPUT_NAMEMAP, namemap_lines)
    log_message(caller, "generate namemap config finished")
def cpu_hog(job_detail, hog_threshold):
    """
    Decide whether a job uses noticeably more cpu than the slots it holds.

    The hog amount is (cpu seconds / wallclock seconds) - slots.

    :param job_detail: object with start_time_secs, cpu_secs, slots and job_and_task attributes
    :param hog_threshold: minimum hog amount for the job to be considered a cpu hog
    :return: the hog amount if it is >= hog_threshold; 0 if the cpu time is not available;
             None otherwise (not started, slots unknown, no elapsed time, or below threshold)
    """
    if job_detail.start_time_secs is None:
        return None
    wallclock_secs = time.time() - job_detail.start_time_secs
    if job_detail.cpu_secs is None:
        # The identifier is concatenated into the message: the second
        # positional argument of log_message is the level, not extra text.
        logging.log_message("Could not get cpu time for " +
                            str(job_detail.job_and_task))
        return 0
    if job_detail.slots is None:
        logging.log_message("Could not get slots for " + str(job_detail))
        return None
    if wallclock_secs <= 0:
        # Nothing meaningful can be computed without elapsed time, and a zero
        # wallclock would raise ZeroDivisionError below.
        return None
    hog_amount = (job_detail.cpu_secs / float(wallclock_secs)) - job_detail.slots
    if hog_amount >= hog_threshold:
        return hog_amount
    return None
def cpu_starved(job_detail, wallclock_threshold_minutes, cpu_fraction):
    """
    Decide whether a job has received too little cpu time for its wallclock time.

    :param job_detail: object with start_time_secs, cpu_secs and job_and_task attributes
    :param wallclock_threshold_minutes: the job is not evaluated until it has been running this long
    :param cpu_fraction: a job with cpu time / wallclock time <= this value is considered cpu-starved
    :return: the cpu/wallclock fraction if the job is starved; 0 if the cpu time is not available;
             None otherwise (not started, not running long enough, or not starved)
    """
    if job_detail.start_time_secs is None:
        return None
    wallclock_secs = time.time() - job_detail.start_time_secs
    if wallclock_secs <= 0 or wallclock_secs < wallclock_threshold_minutes * 60:
        # job has not been running long enough; the <= 0 test also keeps the
        # division below from raising ZeroDivisionError.
        return None
    if job_detail.cpu_secs is None:
        # The identifier is concatenated into the message: the second
        # positional argument of log_message is the level, not extra text.
        logging.log_message("Could not get cpu time for " +
                            str(job_detail.job_and_task))
        return 0
    fraction = job_detail.cpu_secs / float(wallclock_secs)
    if fraction <= cpu_fraction:
        return fraction
    return None
def parse_running_job_resource_info(job_xml_tree):
    """
    Build one JobResourceInfo per running task found in the qstat XML.

    Tasks without a JAT_granted_destin_identifier_list element are logged
    and skipped.  The reported memory is the per-slot h_vmem (default_h_vmem
    when the job requested none) multiplied by the number of slots.

    :param job_xml_tree: output of qstat -xml -j ...
    :return: list of JobResourceInfo
    """
    ancestor_finder = uge_functions.XmlAncestorFinder(job_xml_tree)
    running_tasks = []
    for task_number_node in job_xml_tree.findall("*//JAT_task_number"):
        task_node = ancestor_finder.get_ancestor(task_number_node)
        job_node = ancestor_finder.get_ancestor(task_node, 2)
        job_id = job_node.findtext('JB_job_number')
        task_id = task_number_node.text

        if task_node.find("JAT_granted_destin_identifier_list") is None:
            logging.log_message(
                "Strange XML for running job (%s) task (%s)" %
                (job_id, task_id))
            continue

        slots = int(task_node.findtext('*//JG_slots'))

        # Hard resource requests; either may be absent for a job.
        h_vmem_node = job_node.find(
            "JB_hard_resource_list/element/[CE_name='h_vmem']")
        if h_vmem_node is not None:
            memory = h_vmem_node.findtext("CE_stringval")
        else:
            memory = default_h_vmem

        h_rt_node = job_node.find(
            "JB_hard_resource_list/element/[CE_name='h_rt']")
        if h_rt_node is None:
            h_rt = None
        else:
            h_rt = parse_h_rt(h_rt_node.findtext("CE_stringval"))

        queue = job_node.findtext('*//QR_name', default_queue)

        running_tasks.append(
            JobResourceInfo(
                job_and_task=uge_functions.JobAndTask(job_id=job_id,
                                                      task_id=task_id),
                project=job_node.findtext('JB_project'),
                slots=slots,
                queue=queue,
                memory=slots * uge_functions.dehumanize_memory(memory),
                user=job_node.findtext('JB_owner'),
                host=task_node.findtext("*//JG_qhostname"),
                state="running",
                # Both timestamps are divided by 1000 before being stored.
                submit_time=float(
                    job_node.findtext('JB_submission_time')) / 1000,
                start_time=float(task_node.findtext('JAT_start_time')) / 1000,
                priority=None,
                tickets=None,
                h_rt=h_rt))
    return running_tasks
def read_devices():
    '''Read hostnames from FILE_DEVICES and return a list of Device objects,
    one per hostname, in file order.

    Checks implemented:
        no duplicated hostnames
    '''
    caller = read_devices.__name__
    log_message(caller, 'device reading started')

    hostnames = read_lines_from_file(FILE_DEVICES)
    # A set drops repeated hostnames, so a size mismatch means duplicates.
    if len(set(hostnames)) != len(hostnames):
        interrupt_execution_with_errors(caller, 'duplicated hostnames found')

    devices = [Device(hostname) for hostname in hostnames]

    log_message(caller, 'device reading completed')
    return devices
def generate_ip_address_config(devices):
    '''Write the address and description statements of every interface
    (physical first, then loopback) to the config file of its device.

    For each device all the address statements are written before the
    description statements.
    '''
    caller = generate_ip_address_config.__name__
    log_message(caller, "generate ip address config started")

    for device in devices:
        address_lines = []
        description_lines = []
        for interface in device.interfaces_phy + device.interfaces_lo:
            address_lines.append(
                f"set interfaces {interface.name} unit 0 family inet address {interface.ip_address.with_prefixlen}"
            )
            description_lines.append(
                f"set interfaces {interface.name} description {interface.description}"
            )
        write_device_config_to_file(device.hostname,
                                    address_lines + description_lines)

    log_message(caller, "generate ip address config finished")
def count_starved_jobs_by_host(dct_strange_state_jobs,
                               job_per_host_verbosity_threshold=None,
                               host_per_job_verbosity_threshold=None):
    """
    Count the unique jobs on a host that are cpu-starved, and the unique hosts per starved job.

    Multiple tasks for the same job on the same host count as 1.
    The idea is that a job could look cpu-starved if it is waiting for something, e.g. downloading
    something over the network.  If a host is in trouble, there should be multiple jobs that appear
    cpu-starved.  Conversely, if tasks of one job are cpu-starved on several hosts, it is probably
    the job and not a host in trouble.

    :param dct_strange_state_jobs: key: job_and_task (with a job_id attribute);
           value: problem state (with state and host attributes)
    :param job_per_host_verbosity_threshold: if not None, log every host with at least this many starved jobs
    :param host_per_job_verbosity_threshold: if not None, log every job starved on at least this many hosts
    :return: tuple(dict(host=>count of unique starved jobs),
                   dict(job_id=>count of unique hosts on which a task for that job is starved))
    """
    dct_host_starved_jobs = {}  # key: host; value: set of job IDs (not job_and_task)
    dct_starved_job_hosts = {}  # key: job ID, value: set of hosts on which that job has a starved task
    for job_and_task, problem_state in dct_strange_state_jobs.items():
        if problem_state.state.startswith("cpu-starved"):
            dct_host_starved_jobs.setdefault(problem_state.host,
                                             set()).add(job_and_task.job_id)
            dct_starved_job_hosts.setdefault(job_and_task.job_id,
                                             set()).add(problem_state.host)

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    # The logged lists are sorted so that the output is stable between runs.
    if job_per_host_verbosity_threshold is not None:
        for host, jobs in dct_host_starved_jobs.items():
            if len(jobs) >= job_per_host_verbosity_threshold:
                logging.log_message(
                    str(host) + " has cpu-starved jobs " +
                    ", ".join(sorted(str(job) for job in jobs)))
    if host_per_job_verbosity_threshold is not None:
        for job, hosts in dct_starved_job_hosts.items():
            if len(hosts) >= host_per_job_verbosity_threshold:
                logging.log_message(
                    str(job) + " is cpu-starved on hosts " +
                    ", ".join(sorted(str(host) for host in hosts)))

    return ({host: len(jobs)
             for host, jobs in dct_host_starved_jobs.items()},
            {job_id: len(hosts)
             for job_id, hosts in dct_starved_job_hosts.items()})
def sigint_handler(signum, frame):
    """Signal handler: log the signal number and the interrupted stack, then exit."""
    stack_entries = traceback.extract_stack(frame)
    stack_text = "".join(traceback.format_list(stack_entries))
    logging.log_message("Exiting after received signal %d\n%s" %
                        (signum, stack_text))
    sys.exit()
def _print_table(rows, **format_kwargs):
    """Format rows with tabularize.format_table and print one line per row."""
    for line in tabularize.format_table(rows, **format_kwargs):
        print(line)


def main(args=None):
    """
    Print a report of Grid Engine resource usage.

    The report covers resources allocated to running jobs (by host state, queue, project and
    user), shares by project, and histograms of available slots and memory.  For every job
    given with --job it also reports the running jobs submitted after it ("queue jumpers"),
    the healthy hosts that could hold it, and the pending jobs with more tickets.

    The state is taken from the live Grid Engine commands, or from a snapshot tar file when
    --read-snapshot is given.

    Only constructs valid on both Python 2 and Python 3 are used: print is always called with
    a single argument, filter() results are not indexed, and sorting uses key= instead of cmp=.

    :param args: list of command-line arguments; None makes argparse read sys.argv
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        "--verbose", "-v", action="count", default=0,
        help="Write progress to stderr. Use this option multiple times for more verbosity")
    parser.add_argument(
        "--job", "-j", action="append",
        help="Job ID(s) for which insight into pending state is desired")
    parser.add_argument(
        "--pending-detail", action="store_true", default=False,
        help="Print detail about jobs that have jumped in from of query job")
    parser.add_argument(
        "--ignore-job", "-i", action="append",
        help="Job ID(s) to ignore when -j is specified. "
        "This is sometimes necessary to work around bad UGE XML for some jobs",
        default=[])
    parser.add_argument(
        "--ignore-project", "-p", action="append",
        help="Projects to ignore when finding all extant shares. "
        "This is sometimes necessary because of unused projects",
        default=[])
    parser.add_argument(
        "--ignore-queue", "-q", action="append",
        help="Queues to ignore when counting healthy hosts. "
        "This is sometimes necessary because of unused queues",
        default=[])
    parser.add_argument(
        "--memory-users", "-m", type=int, default=10,
        help="Print user stats for the top N users of memory. Default: %(default)s")
    parser.add_argument(
        "--slot-users", "-s", type=int, default=10,
        help="Print user stats for the top N users of slots. Default: %(default)s")
    parser.add_argument(
        "--mem-histogram-binsize", type=float,
        default=10 * 1024 * 1024 * 1024.0,
        help="Bin size for available memory histogram. Default: %(default)s")
    parser.add_argument(
        "--nocolor", default=False, action='store_true',
        help="Do not use ANSI colors in output")
    parser.add_argument(
        "--write-snapshot", "-w",
        help="Write a tar file containing the output of the various Grid Engine "
        "commands executed")
    parser.add_argument(
        "--read-snapshot", "-r",
        help="Instead of executing Grid Engine commands to obtain current state, "
        "read snapshot tar file containing a past state")
    options = parser.parse_args(args)

    if options.read_snapshot is not None:
        logging.log_message("Loading Grid Engine snapshot from " +
                            options.read_snapshot,
                            level=1, verbosity=options.verbose)
        uge_state = uge_functions.uge_snapshot(
            snapshot_tar=options.read_snapshot)
    else:
        logging.log_message("Taking snapshot of current Grid Engine state",
                            level=1, verbosity=options.verbose)
        uge_state = uge_functions.uge_snapshot()
    if options.write_snapshot is not None:
        logging.log_message("Writing Grid Engine snapshot to " +
                            options.write_snapshot,
                            level=1, verbosity=options.verbose)
        uge_state.save(options.write_snapshot)

    logging.log_message("Getting host info", level=1,
                        verbosity=options.verbose)
    dct_hosts = uge_functions.parse_host_info(
        qhost_tree=uge_state.get_xml(uge_state.uge_snapshot_hosts),
        queues_tree=uge_state.get_xml(uge_state.uge_snapshot_queues),
        queues_to_ignore=options.ignore_queue)
    # A real list, because the hosts are traversed several times below.
    lst_hosts = list(dct_hosts.values())

    logging.log_message("Getting job info", level=1,
                        verbosity=options.verbose)
    running_jobs = parse_running_job_resource_info(
        uge_state.get_xml(uge_state.uge_snapshot_running_jobs))

    all_hosts_report_line = make_report_line("all hosts", lst_hosts)
    lst_report = [all_hosts_report_line]
    lst_report.extend(
        group_and_report(
            lst_hosts,
            key_fun=lambda _host: _host.state,
            key_prefix="hosts in state ",
            key_filter=lambda state: state is not None and len(state) > 0))

    logging.log_message("Getting project info", level=1,
                        verbosity=options.verbose)
    projects = [
        uge_state.get_project_info(project)
        for project in uge_state.get_projects()
        if project not in options.ignore_project
    ]
    total_shares = sum([int(project.fshare) for project in projects])
    dct_shares_by_project = dict([(project.name, project.fshare)
                                  for project in projects])

    lst_report.append(
        make_report_line("available resources on healthy hosts", [
            host._replace(slots=host.slots - host.slots_used,
                          memory=host.memory - host.mem_used)
            for host in lst_hosts
            if host.state is None and host.slots_used is not None
            and host.mem_used is not None
        ]))
    lst_report.extend(
        report_by_queue_project_and_user(
            running_jobs, "running jobs", dct_shares_by_project,
            top_n_memory_users=options.memory_users,
            top_n_slot_users=options.slot_users))

    # Add the percentages
    lst_report = [
        report._replace(
            pct_slots=int(report.slots * 100 / all_hosts_report_line.slots),
            pct_memory=int(report.memory * 100 /
                           all_hosts_report_line.memory),
            pct_shares=None if report.shares is None else int(
                int(report.shares) * 100 / total_shares))
        for report in lst_report
    ]

    # Figure out which project(s) have gotten more slots or RAM than their percentage of shares
    # Leading Falses corresponds to the 2 header lines
    lst_hog = [False, False] + \
        [report.pct_shares is not None and (
            report.pct_memory > report.pct_shares or
            report.pct_slots > report.pct_shares)
         for report in lst_report]

    print("# RESOURCES ALLOCATED TO RUNNING JOBS\n")
    lst_rows = [('class', 'slots', 'memory', 'pct_slots', 'pct_memory', 'pct_shares')] + \
        [(report.label, str(report.slots),
          uge_functions.humanize_memory(report.memory),
          str(report.pct_slots) + '%',
          str(report.pct_memory) + '%',
          '' if report.pct_shares is None else str(report.pct_shares) + '%')
         for report in lst_report]
    if options.nocolor:
        clear_ansi_color = ''
        start_ansi_red = ''
    else:
        clear_ansi_color = '\033[0m'
        start_ansi_red = '\033[31m'
    for line, do_color in zip(
            tabularize.format_table(
                lst_rows,
                right_justify=[False, True, True, True, True, True]),
            lst_hog):
        if do_color:
            print(start_ansi_red + line + clear_ansi_color)
        else:
            print(line)

    print("\n# SHARES BY PROJECT\n")
    lst_rows = [('project', 'shares', 'pct_shares')] + \
        [(project.name, str(project.fshare),
          str(int(int(project.fshare) * 100 / total_shares)) + '%')
         for project in projects]
    _print_table(lst_rows, right_justify=[False, True, True])

    print("\n# HISTOGRAM OF AVAILABLE SLOTS\n")
    slots_histogram = histogram_tools.make_histogram([
        host.slots - host.slots_used for host in lst_hosts
        if host.slots_used is not None and host.state is None
    ], 1)
    lst_rows = [("available_slots", "num_hosts", "num_hosts >= this bin")] + \
        [(str(slots), str(hosts), str(hosts_ge))
         for slots, hosts, hosts_ge in slots_histogram]
    _print_table(lst_rows, right_justify=[True, True, True], column_sep=" ")

    print("\n# HISTOGRAM OF AVAILABLE MEMORY\n")
    slots_histogram = histogram_tools.make_histogram([
        host.memory - host.mem_used for host in lst_hosts
        if host.mem_used is not None and host.state is None
    ], options.mem_histogram_binsize)
    lst_rows = [("available_memory", "num_hosts", "num_hosts >= this bin")] + \
        [(uge_functions.humanize_memory(mem, precision=0) + " <= mem < " +
          uge_functions.humanize_memory(mem + options.mem_histogram_binsize, precision=0),
          str(hosts), str(hosts_ge))
         for mem, hosts, hosts_ge in slots_histogram]
    _print_table(lst_rows, right_justify=[True, True, True], column_sep=" ")

    if options.job is not None and len(options.job) > 0:
        logging.log_message("Getting start times of running jobs", level=1,
                            verbosity=options.verbose)
        pending_jobs_resource_info = get_pending_jobs_resource_info(
            uge_state.get_xml(uge_state.uge_snapshot_jobs))
        for pending_job in options.job:
            # A list (not a filter object), because it is measured and indexed.
            resource_info_list = [
                _job for _job in pending_jobs_resource_info
                if _job.job_and_task.job_id == pending_job
            ]
            if len(resource_info_list) == 0:
                print("\n# %s is not a pending job" % pending_job)
                continue
            resource_info = resource_info_list[0]
            pending_job_submit_time = resource_info.submit_time

            # Running jobs that were submitted after the pending job
            queue_jumper_resource_infos = [
                job_info for job_info in running_jobs
                if job_info.submit_time > pending_job_submit_time
            ]
            lst_report = report_by_queue_project_and_user(
                queue_jumper_resource_infos, "queue jumpers",
                dct_shares_by_project,
                top_n_memory_users=options.memory_users,
                top_n_slot_users=options.slot_users)
            print("\n# QUEUE JUMPERS for job %s. user %s, queue %s, project %s, slots %d, memory %s, tickets %s, h_rt %s\n" % (
                resource_info.job_and_task.job_id, resource_info.user,
                resource_info.queue, resource_info.project,
                resource_info.slots,
                uge_functions.humanize_memory(resource_info.memory),
                resource_info.tickets,
                uge_functions.humanize_seconds(resource_info.h_rt)))
            lst_rows = [('class', 'slots', 'memory', 'pct_slots', 'pct_memory')] + \
                [(report.label, str(report.slots),
                  uge_functions.humanize_memory(report.memory),
                  str(int(report.slots * 100 / all_hosts_report_line.slots)) + '%',
                  str(int(report.memory * 100 / all_hosts_report_line.memory)) + '%')
                 for report in lst_report]
            _print_table(lst_rows,
                         right_justify=[False, True, True, True, True])

            print("\n# HISTOGRAM OF SLOT RESERVATIONS FOR QUEUE JUMPERS for job %s\n" % pending_job)
            queue_jumper_slots_histo = histogram_tools.make_histogram(
                [job_info.slots for job_info in queue_jumper_resource_infos],
                1)
            lst_rows = [("slots_used", "num_queue_jumper_jobs", "num_queue_jumper_jobs >= this bin")] + \
                [(str(slots), str(jobs), str(jobs_ge))
                 for slots, jobs, jobs_ge in queue_jumper_slots_histo]
            _print_table(lst_rows, right_justify=[True, True, True],
                         column_sep=" ")

            lst_available_hosts = [
                (host.host, host.slots - host.slots_used,
                 host.memory - host.mem_used)
                for host in lst_hosts
                if host.state is None and host.mem_used is not None
                and host.slots - host.slots_used >= resource_info.slots
                and host.memory - host.mem_used >= resource_info.memory
                and resource_info.queue in host.queues
            ]
            # Sort by (available_slots, available_memory)
            lst_available_hosts.sort(key=lambda _tup: _tup[1:])
            # Convert to strings for printing
            lst_available_hosts = [(tup[0], str(tup[1]),
                                    uge_functions.humanize_memory(tup[2]))
                                   for tup in lst_available_hosts]
            print("\n# %d HEALTHY HOSTS WITH AT LEAST %d SLOTS AND %s MEMORY IN %s QUEUE FOR JOB %s\n" %
                  (len(lst_available_hosts), resource_info.slots,
                   uge_functions.humanize_memory(resource_info.memory),
                   resource_info.queue, pending_job))
            lst_rows = [("host", "available_slots", "available_memory")
                        ] + lst_available_hosts
            _print_table(lst_rows, right_justify=[False, True, True],
                         column_sep=" ")

            # report on pending jobs with more tickets than query job
            lst_high_ticket_pending_jobs = [
                job_info for job_info in pending_jobs_resource_info
                if job_info.tickets > resource_info.tickets
            ]
            print("\n# Pending jobs with more tickets than job " + pending_job)
            lst_rows = [('class', 'slots', 'memory')] + \
                [(report.label, str(report.slots),
                  uge_functions.humanize_memory(report.memory))
                 for report in (report_by_queue_project_and_user(
                     lst_high_ticket_pending_jobs, "pending jobs",
                     dct_shares_by_project))]
            _print_table(lst_rows, right_justify=[False, True, True])

            print("\n# Pending jobs with more tickets than job " +
                  pending_job + " but submitted after")
            lst_high_ticket_pending_jumpers = [
                job_info for job_info in lst_high_ticket_pending_jobs
                if job_info.submit_time > resource_info.submit_time
            ]
            lst_rows = [('class', 'slots', 'memory')] + \
                [(report.label, str(report.slots),
                  uge_functions.humanize_memory(report.memory))
                 for report in (report_by_queue_project_and_user(
                     lst_high_ticket_pending_jumpers,
                     "pending high-ticket queue jumpers",
                     dct_shares_by_project))]
            _print_table(lst_rows, right_justify=[False, True, True])

            if options.pending_detail:
                # get tickets for running jobs
                queue_jumper_resource_infos = replace_tickets(
                    queue_jumper_resource_infos,
                    uge_state.get_xml(uge_state.uge_snapshot_jobs))
                # Both these tables are in descending order by tickets.
                # reverse=True keeps the sort stable, so jobs with equal
                # tickets stay in their original relative order.
                queue_jumper_resource_infos.sort(
                    key=lambda job: job.tickets, reverse=True)
                lst_high_ticket_pending_jumpers.sort(
                    key=lambda job: job.tickets, reverse=True)

                lst_header = [
                    'job', 'task', 'user', 'project', 'slots', 'memory',
                    'tickets', 'h_rt', 'submitted', 'dispatched'
                ]
                print("\n# Running queue jumper details")
                lst_rows = [lst_header] + [
                    (job.job_and_task.job_id, job.job_and_task.task_id,
                     job.user, job.project, str(job.slots),
                     uge_functions.humanize_memory(job.memory),
                     str(job.tickets),
                     uge_functions.humanize_seconds(job.h_rt),
                     format_time(job.submit_time),
                     format_time(job.start_time))
                    for job in queue_jumper_resource_infos
                ]
                _print_table(lst_rows, right_justify=[
                    False, True, False, False, True, True, True, True, False,
                    False
                ])

                lst_header = [
                    'job', 'user', 'project', 'slots', 'memory', 'tickets',
                    'h_rt', 'submitted'
                ]
                print("\n# Pending queue jumper details")
                lst_rows = [lst_header] + [
                    (job.job_and_task.job_id, job.user, job.project,
                     str(job.slots),
                     uge_functions.humanize_memory(job.memory),
                     str(job.tickets),
                     uge_functions.humanize_seconds(job.h_rt),
                     format_time(job.submit_time))
                    for job in lst_high_ticket_pending_jumpers
                ]
                _print_table(lst_rows, right_justify=[
                    False, False, False, True, True, True, True, False
                ])
def parse_host_info(qhost_tree, queues_tree, queues_to_ignore=None):
    """
    Combine the host list (qhost XML) with the per-queue states (queue XML).

    Every 'host' element of qhost_tree yields one HostInfo.  A host whose num_proc value is '-'
    gets None for slots, memory, slots_used and mem_used.  Every 'Queue-List' element of
    queues_tree, named "queue@host", adds the queue name to the queues of its host and may set
    the host state.

    :param qhost_tree: XML tree containing 'host' elements with 'hostvalue' children
    :param queues_tree: XML tree containing '*/Queue-List' elements with 'name' and 'state' children
    :param queues_to_ignore: collection of queue names to skip; None means no queue is skipped
    :return: dictionary key: host, value HostInfo
    :raises Exception: if two queues on the same host report states that are not compatible
    """
    if queues_to_ignore is None:
        queues_to_ignore = ()

    dctRet = {}
    for host_node in qhost_tree.findall('host'):
        host_name = host_node.get('name')
        dct_hostvalues = {
            hostvalue_node.get('name'): hostvalue_node.text
            for hostvalue_node in host_node.findall('hostvalue')
        }
        if dct_hostvalues['num_proc'] != '-':
            slots = int(dct_hostvalues['num_proc'])
            slots_used = sum(
                int(slots_used_node.text) for slots_used_node in
                host_node.findall(".//queuevalue[@name='slots_used']"))
            memory = dehumanize_memory(dct_hostvalues['mem_total'])
            if dct_hostvalues['mem_used'] == '-':
                mem_used = 0
            else:
                mem_used = dehumanize_memory(dct_hostvalues['mem_used'])
            dctRet[host_name] = HostInfo(host=host_name,
                                         slots=slots,
                                         memory=memory,
                                         state=None,
                                         slots_used=slots_used,
                                         mem_used=mem_used,
                                         queues=set())
        else:
            dctRet[host_name] = HostInfo(host=host_name,
                                         slots=None,
                                         memory=None,
                                         state=None,
                                         slots_used=None,
                                         mem_used=None,
                                         queues=set())

    for queue_info in queues_tree.findall('*/Queue-List'):
        state = queue_info.findtext('state')
        if state is None:
            state = ''
        # Ignore suspended state
        state = state.replace('s', '')
        # Ignore configuration ambiguous state
        state = state.replace('c', '')
        # If disabled, ignore other state flags, because they can vary between queues on a host
        if 'd' in state:
            state = 'd'

        queue = queue_info.findtext('name')
        queue_split = queue.split('@', 1)
        host = queue_split[1]
        queue_name = queue_split[0]
        if queue_name in queues_to_ignore:
            continue

        host_info = dctRet.get(host)
        if host_info is None:
            # The queue refers to a host that qhost did not report.  This is
            # checked before touching host_info, so the mismatch is logged
            # instead of failing with AttributeError.
            if len(state) > 0:
                logging.log_message(host + " found in qstat but not qhost")
            continue

        host_info.queues.add(queue_name)
        if len(state) > 0:
            if host_info.state is None:
                dctRet[host] = host_info._replace(state=state)
            elif not is_host_state_compatible(host_info.state, state):
                raise Exception("Conflicting states for %s: %s != %s" %
                                (host, host_info.state, state))
    return dctRet
def main(args=None): parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument( "--email", "-e", action="append", help= "email address(es) to which report should be sent. Default: write report to stdout" ) parser.add_argument( "--user", "-u", action="append", help="User(s) whose jobs should be checked. Default: all users") parser.add_argument( "--user-file", type=read_list_file, help= "File containing list of users whose jobs should be checked, one per line." ) parser.add_argument("--exclude-user", action="append", help="User(s) whose jobs should not be checked.") parser.add_argument( "--exclude-user-file", type=read_list_file, help= "File containing list of users whose jobs should not be checked, one per line." ) parser.add_argument("--ignore-job", "-j", action="append", help="Job ID(s) that should be ignored") parser.add_argument( "--ignore-job-file", type=read_list_file, help= "File containing list of job ID(s) that should be ignored, one per line" ) parser.add_argument("--ignore-host", action="append", help="Host(s) that should be ignored") parser.add_argument( "--ignore-host-file", type=read_list_file, help= "File containing list of host(s) that should be ignored, one per line") parser.add_argument( "--sleep-mins", "-s", type=float, default=60, help= "Time (in minutes) to sleep between status checks. Default: %(default)s" ) parser.add_argument( "--heartbeat-mins", type=float, default=24 * 60, help= "Time (in minutes) to sleep between sending message even if no problems. " "Default: %(default)s") parser.add_argument( "--delete-mins", "-d", type=float, default=30, help= "A job in deletion state for more than this time (in minutes) is treated as a problem. " "Default: %(default)s") parser.add_argument( "--transfer-mins", "-t", type=float, default=30, help= "A job in transfer state for more than this time (in minutes) is treated as a problem. 
" "Default: %(default)s") parser.add_argument( "--cpu_fraction", "-c", type=float, default=0.0001, help= "A job with cpu time/wallclock time < this value is considered cpu-starved. " "Default: %(default)s") parser.add_argument( "--starved-jobs-per-host", type=int, default=2, help= "A cpu-starved job is not reported unless there are at least this many cpu-starved jobs" " on the host. Multiple task for the same job are counted as one. Default: %(default)s" ) parser.add_argument( "--starved-task-array-hosts", type=int, default=4, help= "If a task array has cpu-starved jobs on at least this many hosts, it is assumed that " "the job is waiting, rather than that the host is starving it of cpu. Default: %(default)s" ) parser.add_argument( "--wallclock_threshold", "-w", type=float, default=3, help= "A job is not considered cpu-starved until its wallclock time (minutes) is >= this value. " "Default: %(default)s") parser.add_argument( "--cpu-hog", type=float, default=2.0, help= "A job with (cpu time/wallclock time) - slots > this value is considered a cpu hog" "Default: %(default)s") parser.add_argument( "--verbose", "-v", action="count", default=0, help= "Write progress to stderr. Use this option multiple times for more verbosity" ) parser.add_argument( "--smtp", default="smtp.broadinstitute.org", help="SMTP host for sending mail. Default: %(default)s") parser.add_argument( "--name", "-n", default="UGER", help= "Name of batch system to be used in messages. 
Default: %(default)s") # key: host; value: time host failed ping dct_no_ping_hosts = {} # key: host; value: time bad state first seen dct_unreachable_queue_hosts = {} # key: JobAndTask; value: ProblemState(state, time state first seen, time reported) dct_strange_state_jobs = {} last_summary_time = time.time() signal.signal(signal.SIGINT, sigint_handler) options = parser.parse_args(args) # Do not send more than one exception message every 30 minutes, to avoid email avalanche mail_throttler = email_notify.MailThrottler( smtp=options.smtp, recipients=options.email, throttle=30 * 60, subject="UGER monitor unhandled exception") logging.log_message("Starting daemon") while True: try: # parse options again every time so that any files are reloaded options = parser.parse_args(args) logging.log_message("Options: " + str(options), 1, options.verbose) options.user = combine_option_lists(options.user, options.user_file, ["*"]) excluded_users = frozenset( combine_option_lists(options.exclude_user, options.exclude_user_file)) ignored_jobs = frozenset( combine_option_lists(options.ignore_job, options.ignore_job_file)) ignored_hosts = frozenset( combine_option_lists(options.ignore_host, options.ignore_host_file)) logging.log_message("Getting job list", 1, options.verbose) # remove jobs to be ignored jobs = [ job for job in uge_functions.get_job_list(options.user) if job.user not in excluded_users and job.host is not None and job.host not in ignored_hosts and job.job_and_task.job_id not in ignored_jobs ] logging.log_message("Clearing strange jobs that are gone", 1, options.verbose) # Remove strange jobs that were not returned in job list current_job_set = set([job.job_and_task for job in jobs]) for strange_job in dct_strange_state_jobs.keys(): if strange_job not in current_job_set: remove_strange_job(dct_strange_state_jobs, strange_job) # key: host; value: list of problem strings dct_problems = {} logging.log_message("Pinging hosts", 1, options.verbose) for newly_failed_host in 
ping_hosts( set(dct_no_ping_hosts.keys() + [job.host for job in jobs]), dct_no_ping_hosts): dct_problems.setdefault(newly_failed_host, []).append("did not respond to ping") logging.log_message("Checking queue states", 1, options.verbose) for newly_failed_host in check_host_queue_state( set(dct_unreachable_queue_hosts.keys() + [job.host for job in jobs]), dct_unreachable_queue_hosts): dct_problems.setdefault( newly_failed_host, []).append("has queue in unreachable state") running_jobs = [] logging.log_message("Checking job state", 1, options.verbose) transfer_state = "transfer" deletion_state = "deletion" for job in jobs: if is_transfer_state(job.state): update_strange_jobs(dct_strange_state_jobs, job, transfer_state) elif is_delete_state(job.state): update_strange_jobs(dct_strange_state_jobs, job, deletion_state) elif is_running_state(job.state): running_jobs.append(job) problem_state = dct_strange_state_jobs.get( job.job_and_task) if problem_state is not None and \ (problem_state.state == transfer_state or problem_state.state == deletion_state): logging.log_message( "Job %s leaving strange state %s and now running" % (str(job), dct_strange_state_jobs[job.job_and_task]), 1, options.verbose) remove_strange_job(dct_strange_state_jobs, job.job_and_task) elif job.job_and_task in dct_strange_state_jobs: # job not in running state anymore logging.log_message("Odd job state " + str(job), 1, options.verbose) remove_strange_job(dct_strange_state_jobs, job.job_and_task) logging.log_message("Checking cpu usage", 1, options.verbose) # Group by requires input is sorted in order to be grouped running_jobs.sort(cmp=lambda job1, job2: cmp( (job1.job_and_task.job_id, job1.job_and_task.task_id), (job2.job_and_task.job_id, job2.job_and_task.task_id))) for ignore_key, job_group in itertools.groupby( running_jobs, key=lambda j: j.job_and_task.job_id): # convert from iterable to list, because the first element is accessed twice jobs_in_group = list(job_group) if jobs_in_group[0].queue 
== "interactive": # Interactive jobs can appear cpu-starved continue job_id = jobs_in_group[0].job_and_task.job_id logging.log_message("Getting details for " + str(job_id), 2, options.verbose) dct_job_details = uge_functions.get_job_detail(job_id) if dct_job_details is None: # all task for job appear to be gone for job in jobs_in_group: remove_strange_job(dct_strange_state_jobs, job.job_and_task) else: for job in jobs_in_group: job_details = dct_job_details.get( job.job_and_task.task_id) if job_details is None: # this job and task is done remove_strange_job(dct_strange_state_jobs, job.job_and_task) else: cpu_fraction = cpu_starved( job_details, options.wallclock_threshold, options.cpu_fraction) if cpu_fraction is not None: update_strange_jobs( dct_strange_state_jobs, job, "cpu-starved %f" % cpu_fraction, job_details) else: cpu_hog_amount = cpu_hog( job_details, options.cpu_hog) if cpu_hog_amount is not None: update_strange_jobs( dct_strange_state_jobs, job, "cpu-hog %f" % cpu_hog_amount, job_details) report_time_threshold = time.time() - 60 * options.sleep_mins deletion_time_threshold = time.time() - 60 * options.delete_mins transfer_time_threshold = time.time() - 60 * options.transfer_mins dct_count_starved_jobs_by_host, dct_starved_job_host_count = \ count_starved_jobs_by_host(dct_strange_state_jobs, options.starved_jobs_per_host if options.verbose > 1 else None, options.starved_task_array_hosts if options.verbose > 1 else None) for job_and_task, problem_state in dct_strange_state_jobs.items(): if problem_state.reported_time is None: if problem_state.state == deletion_state: report_problem = problem_state.first_seen_time <= deletion_time_threshold elif problem_state.state == transfer_state: report_problem = problem_state.first_seen_time <= transfer_time_threshold elif problem_state.state.startswith("cpu-starved"): report_problem = problem_state.first_seen_time <= report_time_threshold and \ dct_count_starved_jobs_by_host[problem_state.host] >= 
options.starved_jobs_per_host and \ dct_starved_job_host_count[job_and_task.job_id] < options.starved_task_array_hosts else: report_problem = problem_state.first_seen_time <= report_time_threshold if report_problem: dct_problems.setdefault(problem_state.host, []).append( format_job_problem(job_and_task, problem_state)) dct_strange_state_jobs[ job_and_task] = problem_state._replace( reported_time=time.time()) # Generate message for all problems even when only reporting new problems. do_heartbeat = time.time( ) - last_summary_time >= 60 * options.heartbeat_mins if len(dct_problems) > 0 or do_heartbeat: last_summary_time = time.time() all_problems = {} for host in dct_no_ping_hosts.keys(): all_problems.setdefault( host, []).append("did not respond to ping") for host in dct_unreachable_queue_hosts.keys(): all_problems.setdefault( host, []).append("has queue in unreachable state") for job_and_task, problem_state in dct_strange_state_jobs.items( ): if problem_state.reported_time is not None: all_problems.setdefault(problem_state.host, []). 
\ append(format_job_problem(job_and_task, problem_state)) if len(dct_problems) > 0: message = "The following problems have been detected since the last check:\n\n" + \ format_problems(dct_problems) + \ "\n\nAll outstanding problems:\n\n" + format_problems(all_problems) logging.log_message(message) if options.email is not None: email_notify.email_report( message, "New " + options.name + " problems", options.email, options.smtp) if do_heartbeat: message = "Previously reported problems that have not cleared:\n\n" + format_problems( all_problems) logging.log_message(message) if options.email is not None: email_notify.email_report( message, "All " + options.name + " problems", options.email, options.smtp) logging.log_message("Sleeping for %s minutes" % options.sleep_mins, 1, options.verbose) time.sleep(options.sleep_mins * 60) except Exception, e: exc_str = traceback.format_exc() logging.log_message("Looping after unhandled exception: " + exc_str) mail_throttler.add_message(exc_str)
def write_system_config_to_file(system_config_file, config):
    '''Write the lines in config to the system config file.

    The target path is built as OUTPUT_SYSTEM_CONFIG + system_config_file +
    ".cfg"; the write itself is delegated to write_lines_to_file.

    system_config_file: base name of the file, without the ".cfg" suffix
    config: list of lines to write
    No return value.
    '''
    # Log under this function's own name so its start/completed messages are
    # distinguishable from the ones write_lines_to_file emits itself.
    log_message(write_system_config_to_file.__name__,
                'write system config started')
    filename = "{}{}.cfg".format(OUTPUT_SYSTEM_CONFIG, system_config_file)
    write_lines_to_file(filename, config)
    log_message(write_system_config_to_file.__name__,
                'write system config completed')
def write_device_config_to_file(device, config):
    '''Write the lines in config to the config file of one device.

    The target path is built as OUTPUT_DEVICE_CONFIG + device + ".cfg"; the
    write itself is delegated to write_lines_to_file.

    device: value used as the file base name (callers pass a hostname)
    config: list of lines to write
    No return value.
    '''
    # Log under this function's own name so its start/completed messages are
    # distinguishable from the ones write_lines_to_file emits itself.
    log_message(write_device_config_to_file.__name__,
                'write device config started')
    filename = "{}{}.cfg".format(OUTPUT_DEVICE_CONFIG, device)
    write_lines_to_file(filename, config)
    log_message(write_device_config_to_file.__name__,
                'write device config completed')
def _has_failure_cluster(sorted_jobs, num_failed_jobs, interval):
    '''Tell whether num_failed_jobs of the jobs ended within interval seconds.

    sorted_jobs must be ordered by ascending end_time, so it is enough to
    compare the first and the last job of every run of num_failed_jobs
    consecutive entries.
    '''
    # A window of fewer than one job is meaningless; clamping keeps the index
    # arithmetic below from going negative.
    window = max(num_failed_jobs, 1)
    # The run starting at `start` ends at start + window - 1, which covers
    # exactly `window` jobs (not window + 1).
    return any(
        sorted_jobs[start + window - 1].end_time
        - sorted_jobs[start].end_time <= interval
        for start in range(len(sorted_jobs) - window + 1))


def main(args=None):
    '''Follow the Grid Engine accounting file and report "black hole" hosts.

    Runs `tail -F` on the accounting file and, for every job yielded by
    generate_accounting_lines, records jobs that exited with status >= 128
    within --job-duration seconds and were not deleted by a user. A host is
    reported (log, stdout and throttled e-mail) when at least
    --num-failed-jobs of its recorded jobs ended within --interval seconds.

    args: list of command-line arguments, or None to let argparse read
        sys.argv.
    Returns None once generate_accounting_lines stops yielding jobs.
    '''
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--job-duration", "-j", type=float, default=60.0,
        help="Count jobs that die within this many seconds. "
             "Default: %(default)s")
    parser.add_argument(
        "--num-failed-jobs", "-n", type=int, default=5,
        help="Report if at least this many distinct jobs "
             "(as opposed to multiple tasks for the same job)"
             " die on the same host. Default: %(default)s")
    parser.add_argument(
        "--interval", "-i", type=float, default=300.0,
        help="Report if enough jobs failed on the same host within this "
             "many seconds. "
             "Default: %(default)s")
    # full_accounting_path is a module-level value; when it is None the user
    # has to name the accounting file explicitly.
    if full_accounting_path is None:
        parser.add_argument(
            "--accounting", "-a", required=True,
            help="Grid Engine accounting file to monitor. "
                 "This is required if SGE_ROOT and SGE_CELL environment "
                 "variables are not set. "
                 "Load a Grid Engine dotkit to set these environment "
                 "variables.")
    else:
        parser.add_argument(
            "--accounting", "-a", default=full_accounting_path,
            help="Grid Engine accounting file to monitor. "
                 "Default: %(default)s")
    parser.add_argument(
        "--email", "-e", action="append",
        help="email address(es) to which report should be sent. "
             "Default: only write report to stdout")
    parser.add_argument(
        "--smtp", default="smtp.broadinstitute.org",
        help="SMTP host for sending mail. Default: %(default)s")
    parser.add_argument(
        "--email-interval", type=int, default=300,
        help="Send an email message no more frequently than this many "
             "seconds, in order to avoid "
             "swamping mail server. Default: %(default)s")
    parser.add_argument(
        "--heartbeat-interval", type=int, default=60 * 60 * 24,
        help="Send an email message after this many seconds, to make sure "
             "the daemon is alive. "
             "Default: %(default)s")
    parser.add_argument(
        "--verbose", "-v", action="count", default=0,
        help="Write progress to stderr. "
             "Use this option multiple times for more verbosity")
    options = parser.parse_args(args)
    logging.log_message(str(options))

    mail_throttler = email_notify.MailThrottler(
        smtp=options.smtp,
        recipients=options.email,
        throttle=options.email_interval,
        subject="UGER black hole hosts")
    last_heartbeat = time.time()

    # bufsize=-1 => 'fully buffered'
    proc = subprocess.Popen(['tail', '-F', options.accounting],
                            stdout=subprocess.PIPE, bufsize=-1)
    # Skip first line which may be truncated
    proc.stdout.readline()

    dct_by_host = {}
    for job in generate_accounting_lines(proc.stdout):
        now = time.time()
        if now - last_heartbeat >= options.heartbeat_interval:
            # Only mail the heartbeat when recipients were given; without
            # --email the heartbeat is just logged.
            if options.email is not None:
                email_notify.email_report(
                    message="Grid Engine black hole detector is alive",
                    subject="Grid Engine black hole heartbeat",
                    recipients=options.email,
                    smtp_host=options.smtp)
            logging.log_message("Grid Engine black hole detector is alive")
            last_heartbeat = now
        mail_throttler.maybe_send()
        logging.log_message("Terminated job: " + str(job),
                            level=3, verbosity=options.verbose)

        is_quick_failure = (job.exit_status >= 128
                            and job.duration <= options.job_duration
                            and job.deleted_by is None)
        if not is_quick_failure:
            continue

        b_potential_black_hole = False
        logging.log_message("Failed job " + str(job),
                            level=2, verbosity=options.verbose)
        # NOTE(review): update_dict presumably also drops entries older than
        # age_out_interval -- confirm against its definition.
        update_dict(dct_by_host, job, age_out_interval=options.interval * 2)

        # .items()/.values()/range work on both Python 2 and Python 3,
        # unlike iterkeys()/itervalues()/xrange.
        for host, dct_host in dct_by_host.items():
            if len(dct_host) < options.num_failed_jobs:
                continue
            b_potential_black_hole = True
            logging.log_message("potential black hole " + host,
                                level=1, verbosity=options.verbose)
            sorted_jobs = sorted(dct_host.values(),
                                 key=lambda j: j.end_time)
            if _has_failure_cluster(sorted_jobs, options.num_failed_jobs,
                                    options.interval):
                black_hole_message = "BLACK HOLE: %s has %d failed jobs" % (
                    host, len(sorted_jobs))
                logging.log_message(black_hole_message, file=sys.stdout)
                logging.log_message(black_hole_message)
                mail_throttler.add_message(black_hole_message)

        if b_potential_black_hole:
            logging.log_message(
                "Current failed job state: "
                + format_failed_jobs_dict(dct_by_host),
                level=2, verbosity=options.verbose)
def read_topology(devices_valid):
    '''Parse FILE_PHY_INTERFACES into Link objects and return them as a list.

    Only lines containing ':' are processed. Such a line is split on ','
    and read as: link id, network base, network prefix, A-end device,
    A-end interface, A-end host part, Z-end device, Z-end interface,
    Z-end host part. Each interface address is "<network base>.<host part>".

    Side effect: for both ends of every link, the interface is appended to
    interfaces_phy of the entry in devices_valid with the same hostname. A
    hostname missing from devices_valid is only reported with print here.

    Checks run after all lines are parsed (each failure calls
    interrupt_execution_with_errors):
        every device read is contained in devices_valid
        no endpoint appears more than once

    NOTE(review): no same-device (physical loopback) check is visible in
    this function -- confirm whether Link or Endpoint enforces it.
    '''
    log_message(read_topology.__name__, 'physical interfaces reading started')

    raw_lines = read_lines_from_file(FILE_PHY_INTERFACES)
    devices, endpoints, links = [], [], []
    valid_by_hostname = {valid.hostname: valid for valid in devices_valid}

    for line in raw_lines:
        # link_id is constructed using ':', so other lines describe no link
        if ':' not in line:
            continue

        columns = line.split(',')
        # columns[0] is the link id; it is not needed to build the link.
        # Plain indexing keeps extra trailing columns harmless.
        network_base_input, prefix_length = columns[1], columns[2]
        a_name, a_port, a_end_host_ip_input = (
            columns[3], columns[4], columns[5])
        z_name, z_port, z_end_host_ip_input = (
            columns[6], columns[7], columns[8])

        a_end_device = Device(a_name)
        a_end_interface = Interface(
            a_port, f'{network_base_input}.{a_end_host_ip_input}',
            prefix_length)
        z_end_device = Device(z_name)
        z_end_interface = Interface(
            z_port, f'{network_base_input}.{z_end_host_ip_input}',
            prefix_length)

        a_end_endpoint = Endpoint(a_end_device, a_end_interface)
        z_end_endpoint = Endpoint(z_end_device, z_end_interface)
        devices.extend((a_end_device, z_end_device))
        endpoints.extend((a_end_endpoint, z_end_endpoint))

        link = Link(a_end_endpoint, z_end_endpoint)
        try:
            # A end first, then Z end; an unknown hostname aborts the rest.
            for link_end in (link.endpoint_a_end, link.endpoint_z_end):
                valid_by_hostname[
                    link_end.device.hostname].interfaces_phy.append(
                        link_end.interface)
        except KeyError:
            print(
                f"{a_end_device.hostname} and/or {z_end_device.hostname} not found in in devices_valid"
            )
        links.append(link)

    if not set(devices) <= set(devices_valid):
        print(f"Invalid devices found:{set(devices) - set(devices_valid)}")
        interrupt_execution_with_errors(read_topology.__name__,
                                        "Invalid device read")

    if len(set(endpoints)) != len(endpoints):
        interrupt_execution_with_errors(read_topology.__name__,
                                        "Physical Duplicated endpoint found")

    log_message(read_topology.__name__, 'physical interfaces reading finished')
    return links