Example no. 1
0
def log(header="", message="", send_to_remote=False):
    event_string = '%s: %s' % (header, message)
    buf = packer.pack_event_v1(event_string)
    logging.log_message(buf)
    if send_to_remote:
        remote_link.send_message(buf)
    return True
Example no. 2
0
def read_loopback(devices_valid):
    '''Reads from FILE_LO_INTERFACES, updates Device. No return value.
    Checks implemented:
        no duplicated endpoint
        endpoint using valid device
    '''
    log_message(read_loopback.__name__, 'loopback interfaces reading started')
    lines = read_lines_from_file(FILE_LO_INTERFACES)
    devices = []
    endpoints = []
    devices_valid_dict = {device.hostname: device for device in devices_valid}

    for line in lines:
        fields = line.split(',')
        device = Device(fields[0])
        devices.append(device)
        loopback_interface = LoopbackInterface(fields[1], fields[2])
        endpoint = Endpoint(device, loopback_interface)
        endpoints.append(endpoint)
        try:
            devices_valid_dict[device.hostname].interfaces_lo.append(
                loopback_interface)
        except KeyError:
            print(f"{device.hostname} key not found in devices_valid")

    if not set(devices).issubset(set(devices_valid)):
        print(f"Invalid devices found:{set(devices) - set(devices_valid)}")
        interrupt_execution_with_errors(read_loopback.__name__,
                                        "Invalid device read")

    if not len(endpoints) == len(set(endpoints)):
        interrupt_execution_with_errors(read_loopback.__name__,
                                        "Loopback Duplicated endpoint found")

    log_message(read_loopback.__name__, 'loopback interfaces reading finished')
def finilize_config_files(devices):
    '''Finalize the config file for each device'''
    log_message(finilize_config_files.__name__, 'finalizing config started')
    config = []
    config.append(CONFIG_FINALIZE)
    for device in devices:
        write_device_config_to_file(device.hostname, config)
    log_message(finilize_config_files.__name__, 'finalizing config completed')
Example no. 4
0
def write_lines_to_file(filename, lines):
    '''Given a filename and a list, writes each list element as a line in the file'''
    log_message(write_lines_to_file.__name__, 'lines writing started')
    file_handler = open(filename, 'a+')
    lines = map(lambda x: x + '\n', lines)
    file_handler.writelines(lines)
    file_handler.close()
    log_message(write_lines_to_file.__name__, 'lines writing completed')
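A quick standalone sketch of the same append-and-newline pattern on a temporary file; the sample config lines and path are invented.
import os
import tempfile

lines = ["set system host-name r1", "set system services ssh"]   # hypothetical config lines
path = os.path.join(tempfile.mkdtemp(), "r1.cfg")
with open(path, 'a+') as file_handler:            # 'a+' so repeated calls keep appending
    file_handler.writelines(line + '\n' for line in lines)
with open(path) as file_handler:
    print(file_handler.read())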
def generate_hostname_config(devices):
    '''Generates the configuration required to set the hostname
    Hostname is set to device name'''
    log_message(generate_hostname_config.__name__,
                'initializing hostname config started')
    for device in devices:
        config_lines = [f'set system host-name {device.hostname}']
        write_device_config_to_file(device.hostname, config_lines)
    log_message(generate_hostname_config.__name__,
                'initializing hostname config finished')
Example no. 6
0
def get_job_detail(job_id):
    """
    :param job_id: Either a job_id string, or job_id.task_array_index string
    :return: dictionary key: task_id, value: JobDetail object for the running task, or None if job is gone
    """
    try:
        tree, str_xml = run_qstat(['-j', job_id])
    except Exception, e:
        logging.log_message("Exception getting job detail for " + job_id +
                            ": " + str(e))
        return None
Example no. 7
0
def get_multiple_running_job_detail(lst_job_id):
    """
    :param lst_job_id: a list of jobs to query 
    :return: dictionary key: job_id, value: dictionary with key: task_id, value: JobDetail
    """
    try:
        jobs = ",".join(lst_job_id)
        tree, str_xml = run_qstat(['-j', jobs])
    except Exception, e:
        logging.log_message("Exception getting job detail for " + jobs + ": " +
                            str(e))
        return None
def generate_links_config(links):
    '''Generate link.cfg with the following format:
    DeviceA interfaceA DeviceB interfaceB
    '''
    log_message(generate_links_config.__name__,
                "generate links config started")
    links_config = [
        f"{link.endpoint_a_end.device.hostname} {link.endpoint_a_end.interface.name} {link.endpoint_z_end.device.hostname} {link.endpoint_z_end.interface.name}"
        for link in links
    ]
    write_system_config_to_file(OUTPUT_LINKS_CFG, links_config)
    log_message(generate_links_config.__name__,
                "generate links config finished")
Example no. 9
0
def read_lines_from_file(filename):
    '''Given a filename, returns a list with all the non-empty and uncommented lines, stripped'''
    log_message(read_lines_from_file.__name__, 'file reading started')
    file_handler = open(filename, errors='ignore', encoding='utf-8-sig')
    lines = []
    for line in file_handler:
        line = line.strip()
        if line and not line.startswith('#'):
            lines.append(line)
    file_handler.close()
    log_message(read_lines_from_file.__name__, 'file reading completed')
    if not lines:  #check for empty file
        interrupt_execution_with_errors(read_lines_from_file.__name__,
                                        f'{filename} is empty')
    return lines
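A minimal, self-contained sketch of the same filtering rules (strip, drop blank lines, drop '#' comments) applied to an in-memory file; the sample content is invented.
import io

sample = io.StringIO(u"# devices file\n\nrouter1\nrouter2\n  # trailing comment\n")
lines = []
for line in sample:
    line = line.strip()
    if line and not line.startswith('#'):
        lines.append(line)
print(lines)  # ['router1', 'router2']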
def generate_namemap_config(devices):
    '''Generate namemap with the following format:
    VMID DEVICE_HOSTNAME
    VMID starts from VM_ID_START set in constants and is then incremented by one
    '''
    log_message(generate_namemap_config.__name__,
                "generate namemap config started")
    config = []
    id = VM_ID_START
    devices_sorted = sorted(devices, key=lambda e: e.hostname)
    for device in devices_sorted:
        config.append(f"{id} {device.hostname}")
        id = id + 1
    write_system_config_to_file(OUTPUT_NAMEMAP, config)
    log_message(generate_namemap_config.__name__,
                "generate namemap config finished")
Example no. 11
0
def cpu_hog(job_detail, hog_threshold):
    if job_detail.start_time_secs is None:
        return None
    wallclock_secs = time.time() - job_detail.start_time_secs
    if job_detail.cpu_secs is None:
        logging.log_message("Could not get cpu time for ",
                            job_detail.job_and_task)
        return 0
    elif job_detail.slots is None:
        logging.log_message("Could not get slots for ", job_detail)
    else:
        hog_amount = (job_detail.cpu_secs /
                      float(wallclock_secs)) - job_detail.slots
        if hog_amount >= hog_threshold:
            return hog_amount
        else:
            return None
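A worked example of the hog calculation, with invented numbers: a job that accumulated 7200 CPU-seconds over 3600 wall-clock seconds on one slot is using roughly one core more than it was granted.
cpu_secs = 7200.0        # hypothetical accumulated CPU time
wallclock_secs = 3600.0  # hypothetical elapsed wall-clock time
slots = 1                # slots granted to the job
hog_amount = cpu_secs / wallclock_secs - slots
print(hog_amount)        # 1.0 -> reported when hog_threshold <= 1.0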
Example no. 12
0
def cpu_starved(job_detail, wallclock_threshold_minutes, cpu_fraction):
    if job_detail.start_time_secs is None:
        return None
    wallclock_secs = time.time() - job_detail.start_time_secs
    if wallclock_secs < wallclock_threshold_minutes * 60:
        # job has not been running long enough
        return None
    elif job_detail.cpu_secs is None:
        logging.log_message("Could not get cpu time for ",
                            job_detail.job_and_task)
        return 0
    else:
        fraction = job_detail.cpu_secs / float(wallclock_secs)
        if fraction <= cpu_fraction:
            return fraction
        else:
            return None
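The starvation test is the complementary ratio; with the invented numbers below, a job that used 0.2 CPU-seconds over an hour falls under a cpu_fraction threshold of 0.0001.
cpu_secs = 0.2           # hypothetical accumulated CPU time
wallclock_secs = 3600.0  # the job has been running for an hour
fraction = cpu_secs / wallclock_secs
print(fraction)          # ~5.6e-05, under a cpu_fraction threshold of 0.0001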
Example no. 13
0
def parse_running_job_resource_info(job_xml_tree):
    """
    Get info about all running jobs, including resource info (-r) and project (-ext)
    :param job_xml_tree: output of qstat -xml -j ...
    :return: list of JobResourceInfo
    """
    ancestor_finder = uge_functions.XmlAncestorFinder(job_xml_tree)
    lst_ret = []
    for task_number in job_xml_tree.findall("*//JAT_task_number"):
        task = ancestor_finder.get_ancestor(task_number)
        job = ancestor_finder.get_ancestor(task, 2)
        job_id = job.findtext('JB_job_number')
        task_id = task_number.text
        if task.find("JAT_granted_destin_identifier_list") is None:
            logging.log_message("Strange XML for running job (%s) task (%s)" %
                                (job_id, task_id))
            continue
        slots_str = task.findtext('*//JG_slots')
        slots = int(slots_str)
        h_vmem_node = job.find(
            "JB_hard_resource_list/element/[CE_name='h_vmem']")
        if h_vmem_node is None:
            memory = default_h_vmem
        else:
            memory = h_vmem_node.findtext("CE_stringval")
        h_rt_node = job.find("JB_hard_resource_list/element/[CE_name='h_rt']")
        h_rt = parse_h_rt(h_rt_node.findtext(
            "CE_stringval")) if h_rt_node is not None else None
        queue = job.findtext('*//QR_name', default_queue)
        lst_ret.append(
            JobResourceInfo(
                job_and_task=uge_functions.JobAndTask(job_id=job_id,
                                                      task_id=task_id),
                project=job.findtext('JB_project'),
                slots=slots,
                queue=queue,
                memory=slots * uge_functions.dehumanize_memory(memory),
                user=job.findtext('JB_owner'),
                host=task.findtext("*//JG_qhostname"),
                state="running",
                submit_time=float(job.findtext('JB_submission_time')) / 1000,
                start_time=float(task.findtext('JAT_start_time')) / 1000,
                priority=None,
                tickets=None,
                h_rt=h_rt))
    return lst_ret
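The function relies on ElementTree descendant searches and findtext defaults; below is a self-contained sketch on a hand-written XML fragment (the element names mirror the qstat output above, the values are invented).
import xml.etree.ElementTree as ET

xml = """
<job_info>
  <JB_job_number>12345</JB_job_number>
  <tasks>
    <element>
      <JAT_task_number>1</JAT_task_number>
      <JAT_start_time>1700000000000</JAT_start_time>
    </element>
  </tasks>
</job_info>
"""
job = ET.fromstring(xml)
for task_number in job.findall(".//JAT_task_number"):    # descendant search
    print(task_number.text)                               # '1'
# findtext with a default mirrors the queue/default_queue lookup above
print(job.findtext("JB_project", "default_project"))      # 'default_project' (element missing)
print(float(job.findtext(".//JAT_start_time")) / 1000)    # 1700000000.0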
Example no. 14
0
def read_devices():
    '''Read Devices from FILE_DEVICES and return a list of Device objects (one per hostname)
    Checks implemented:
        no duplicates
    '''
    log_message(read_devices.__name__, 'device reading started')
    devices = []
    devices_read = read_lines_from_file(FILE_DEVICES)
    #check for duplicates
    if not len(devices_read) == len(set(devices_read)):
        interrupt_execution_with_errors(read_devices.__name__,
                                        'duplicated hostnames found')
    for hostname in devices_read:
        device = Device(hostname)
        devices.append(device)
    log_message(read_devices.__name__, 'device reading completed')
    return devices
def generate_ip_address_config(devices):
    '''Generates interface configuration for each device'''
    log_message(generate_ip_address_config.__name__,
                "generate ip address config started")
    for device in devices:
        interfaces = device.interfaces_phy + device.interfaces_lo
        ip_config_lines = [
            f"set interfaces {interface.name} unit 0 family inet address {interface.ip_address.with_prefixlen}"
            for interface in interfaces
        ]
        desc_config_lines = [
            f"set interfaces {interface.name} description {interface.description}"
            for interface in interfaces
        ]
        config_lines = ip_config_lines + desc_config_lines
        write_device_config_to_file(device.hostname, config_lines)
    log_message(generate_ip_address_config.__name__,
                "generate ip address config finished")
Example no. 16
0
def count_starved_jobs_by_host(dct_strange_state_jobs,
                               job_per_host_verbosity_threshold=None,
                               host_per_job_verbosity_threshold=None):
    # Count the unique jobs on a host that are cpu-starved.
    # Multiple tasks for the same job on the same host count as 1.
    # The idea is that a job could look cpu-starved if it is waiting for something, e.g. downloading something
    # over the network.  If a host is in trouble, there should be multiple jobs that appear cpu-starved.
    # Also, count the number of hosts on which one of the tasks for a job is cpu-starved.  If there are several,
    # it is probably the job and not a host in trouble.
    # returns tuple(dict(host=>count of unique starved jobs), dict(job_id=>count of unique hosts on which a task for that job is starved))

    dct_host_starved_jobs = {
    }  # key: host; value: set of job IDs (not job_and_task)
    dct_starved_job_hosts = {
    }  # key: job ID, value: set of hosts on which that job has a starved task
    for job_and_task, problem_state in dct_strange_state_jobs.items():
        if problem_state.state.startswith("cpu-starved"):
            dct_host_starved_jobs.setdefault(problem_state.host,
                                             set()).add(job_and_task.job_id)
            dct_starved_job_hosts.setdefault(job_and_task.job_id,
                                             set()).add(problem_state.host)

    if job_per_host_verbosity_threshold is not None:
        for host, jobs in dct_host_starved_jobs.iteritems():
            if len(jobs) >= job_per_host_verbosity_threshold:
                logging.log_message(host + " has cpu-starved jobs " +
                                    ", ".join(jobs))

    if host_per_job_verbosity_threshold is not None:
        for job, hosts in dct_starved_job_hosts.iteritems():
            if len(hosts) >= host_per_job_verbosity_threshold:
                logging.log_message(job + " is cpu-starved on hosts " +
                                    ", ".join(hosts))

    return (dict([(host, len(jobs))
                  for host, jobs in dct_host_starved_jobs.items()]),
            dict([(job_id, len(hosts))
                  for job_id, hosts in dct_starved_job_hosts.items()]))
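The host/job cross-counting is built on dict.setdefault with a set; a tiny standalone illustration with made-up job IDs and hosts:
starved = [("job1", "hostA"), ("job1", "hostA"), ("job2", "hostA"), ("job1", "hostB")]

dct_host_jobs = {}
dct_job_hosts = {}
for job_id, host in starved:
    dct_host_jobs.setdefault(host, set()).add(job_id)   # duplicates collapse in the set
    dct_job_hosts.setdefault(job_id, set()).add(host)

print({h: len(j) for h, j in dct_host_jobs.items()})    # {'hostA': 2, 'hostB': 1}
print({j: len(h) for j, h in dct_job_hosts.items()})    # {'job1': 2, 'job2': 1}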
Example no. 17
0
def sigint_handler(signum, frame):
    """Log termination cleanly"""
    logging.log_message(
        "Exiting after received signal %d\n%s" % (signum, "".join(
            traceback.format_list(traceback.extract_stack(frame)))))
    sys.exit()
Example no. 18
0
def main(args=None):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help=
        "Write progress to stderr. Use this option multiple times for more verbosity"
    )
    parser.add_argument(
        "--job",
        "-j",
        action="append",
        help="Job ID(s) for which insight into pending state is desired")
    parser.add_argument(
        "--pending-detail",
        action="store_true",
        default=False,
        help="Print detail about jobs that have jumped in from of query job")
    parser.add_argument(
        "--ignore-job",
        "-i",
        action="append",
        help="Job ID(s) to ignore when -j is specified.  "
        "This is sometimes necessary to work around bad UGE XML for some jobs",
        default=[])
    parser.add_argument(
        "--ignore-project",
        "-p",
        action="append",
        help="Projects to ignore when finding all extant shares.  "
        "This is sometimes necessary because of unused projects",
        default=[])
    parser.add_argument("--ignore-queue",
                        "-q",
                        action="append",
                        help="Queues to ignore when counting healthy hosts.  "
                        "This is sometimes necessary because of unused queues",
                        default=[])
    parser.add_argument(
        "--memory-users",
        "-m",
        type=int,
        default=10,
        help=
        "Print user stats for the top N users of memory.  Default: %(default)s"
    )
    parser.add_argument(
        "--slot-users",
        "-s",
        type=int,
        default=10,
        help=
        "Print user stats for the top N users of slots.  Default: %(default)s")
    parser.add_argument(
        "--mem-histogram-binsize",
        type=float,
        default=10 * 1024 * 1024 * 1024.0,
        help="Bin size for available memory histogram.  Default: %(default)s")
    parser.add_argument("--nocolor",
                        default=False,
                        action='store_true',
                        help="Do not use ANSI colors in output")
    parser.add_argument(
        "--write-snapshot",
        "-w",
        help="Write a tar file containing the output of the various Grid Engine "
        "commands executed")
    parser.add_argument(
        "--read-snapshot",
        "-r",
        help=
        "Instead of executing Grid Engine commands to obtain current state, "
        "read snapshot tar file containing a past state")
    options = parser.parse_args(args)

    if options.read_snapshot is not None:
        logging.log_message("Loading Grid Engine snapshot from " +
                            options.read_snapshot,
                            level=1,
                            verbosity=options.verbose)
        uge_state = uge_functions.uge_snapshot(
            snapshot_tar=options.read_snapshot)
    else:
        logging.log_message("Taking snapshot of current Grid Engine state",
                            level=1,
                            verbosity=options.verbose)
        uge_state = uge_functions.uge_snapshot()

    if options.write_snapshot is not None:
        logging.log_message("Writing Grid Engine snapshot to " +
                            options.write_snapshot,
                            level=1,
                            verbosity=options.verbose)
        uge_state.save(options.write_snapshot)

    logging.log_message("Getting host info",
                        level=1,
                        verbosity=options.verbose)
    dct_hosts = uge_functions.parse_host_info(
        qhost_tree=uge_state.get_xml(uge_state.uge_snapshot_hosts),
        queues_tree=uge_state.get_xml(uge_state.uge_snapshot_queues),
        queues_to_ignore=options.ignore_queue)
    lst_hosts = dct_hosts.values()

    logging.log_message("Getting job info", level=1, verbosity=options.verbose)
    running_jobs = parse_running_job_resource_info(
        uge_state.get_xml(uge_state.uge_snapshot_running_jobs))

    all_hosts_report_line = make_report_line("all hosts", lst_hosts)
    lst_report = [all_hosts_report_line]
    lst_report.extend(
        group_and_report(
            lst_hosts,
            key_fun=lambda _host: _host.state,
            key_prefix="hosts in state ",
            key_filter=lambda state: state is not None and len(state) > 0))

    logging.log_message("Getting project info",
                        level=1,
                        verbosity=options.verbose)
    projects = [
        uge_state.get_project_info(project)
        for project in uge_state.get_projects()
        if project not in options.ignore_project
    ]
    total_shares = sum([int(project.fshare) for project in projects])
    dct_shares_by_project = dict([(project.name, project.fshare)
                                  for project in projects])

    lst_report.append(
        make_report_line("available resources on healthy hosts", [
            host._replace(slots=host.slots - host.slots_used,
                          memory=host.memory - host.mem_used)
            for host in lst_hosts if host.state is None
            and host.slots_used is not None and host.mem_used is not None
        ]))

    lst_report.extend(
        report_by_queue_project_and_user(
            running_jobs,
            "running jobs",
            dct_shares_by_project,
            top_n_memory_users=options.memory_users,
            top_n_slot_users=options.slot_users))

    # Add the percentages
    lst_report = [
        report._replace(
            pct_slots=int(report.slots * 100 / all_hosts_report_line.slots),
            pct_memory=int(report.memory * 100 / all_hosts_report_line.memory),
            pct_shares=None if report.shares is None else int(
                int(report.shares) * 100 / total_shares))
        for report in lst_report
    ]
    # Figure out which project(s) have gotten more slots or RAM than their percentage of shares
    # Leading Falses correspond to the 2 header lines
    lst_hog = [False, False] + \
              [report.pct_shares is not None and (
                      report.pct_memory > report.pct_shares or report.pct_slots > report.pct_shares) for report in
               lst_report]

    print "# RESOURCES ALLOCATED TO RUNNING JOBS\n"
    lst_rows = [('class', 'slots', 'memory', 'pct_slots', 'pct_memory', 'pct_shares')] + \
               [(report.label, str(report.slots), uge_functions.humanize_memory(report.memory),
                 str(report.pct_slots) + '%',
                 str(report.pct_memory) + '%',
                 '' if report.pct_shares is None else str(report.pct_shares) + '%') for report in lst_report]

    if options.nocolor:
        clear_ansi_color = ''
        start_ansi_red = ''
    else:
        clear_ansi_color = '\033[0m'
        start_ansi_red = '\033[31m'

    for line, do_color in zip(
            tabularize.format_table(
                lst_rows, right_justify=[False, True, True, True, True, True]),
            lst_hog):
        if do_color:
            print start_ansi_red + line + clear_ansi_color
        else:
            print line

    print "\n# SHARES BY PROJECT\n"
    lst_rows = [('project', 'shares', 'pct_shares')] + \
               [(project.name, str(project.fshare), str(int(int(project.fshare) * 100 / total_shares)) + '%')
                for project in projects]
    for line in tabularize.format_table(lst_rows,
                                        right_justify=[False, True, True]):
        print line

    print "\n# HISTOGRAM OF AVAILABLE SLOTS\n"
    slots_histogram = histogram_tools.make_histogram([
        host.slots - host.slots_used for host in lst_hosts
        if host.slots_used is not None and host.state is None
    ], 1)
    lst_rows = [("available_slots", "num_hosts", "num_hosts >= this bin")] + \
               [(str(slots), str(hosts), str(hosts_ge)) for slots, hosts, hosts_ge in slots_histogram]
    for line in tabularize.format_table(lst_rows,
                                        right_justify=[True, True, True],
                                        column_sep="   "):
        print line

    print "\n# HISTOGRAM OF AVAILABLE MEMORY\n"
    slots_histogram = histogram_tools.make_histogram([
        host.memory - host.mem_used for host in lst_hosts
        if host.mem_used is not None and host.state is None
    ], options.mem_histogram_binsize)
    lst_rows = [("available_memory", "num_hosts", "num_hosts >= this bin")] + \
               [(uge_functions.humanize_memory(mem, precision=0) + " <= mem < " +
                 uge_functions.humanize_memory(mem + options.mem_histogram_binsize, precision=0),
                 str(hosts), str(hosts_ge))
                for mem, hosts, hosts_ge in slots_histogram]
    for line in tabularize.format_table(lst_rows,
                                        right_justify=[True, True, True],
                                        column_sep="   "):
        print line

    if options.job is not None and len(options.job) > 0:
        logging.log_message("Getting start times of running jobs",
                            level=1,
                            verbosity=options.verbose)
        pending_jobs_resource_info = get_pending_jobs_resource_info(
            uge_state.get_xml(uge_state.uge_snapshot_jobs))
        for pending_job in options.job:
            resource_info_list = filter(
                lambda _job: _job.job_and_task.job_id == pending_job,
                pending_jobs_resource_info)
            if len(resource_info_list) == 0:
                print "\n# %s is not a pending job" % pending_job
                continue
            resource_info = resource_info_list[0]
            pending_job_submit_time = resource_info.submit_time
            queue_jumper_resource_infos = [
                job_info for job_info in running_jobs
                if job_info.submit_time > pending_job_submit_time
            ]
            lst_report = report_by_queue_project_and_user(
                queue_jumper_resource_infos,
                "queue jumpers",
                dct_shares_by_project,
                top_n_memory_users=options.memory_users,
                top_n_slot_users=options.slot_users)
            print "\n# QUEUE JUMPERS for job %s. user %s, queue %s, project %s, slots %d, memory %s, tickets %s, h_rt %s\n" % (
                resource_info.job_and_task.job_id, resource_info.user,
                resource_info.queue, resource_info.project,
                resource_info.slots,
                uge_functions.humanize_memory(
                    resource_info.memory), resource_info.tickets,
                uge_functions.humanize_seconds(resource_info.h_rt))
            lst_rows = [('class', 'slots', 'memory', 'pct_slots', 'pct_memory')] + \
                       [(report.label, str(report.slots), uge_functions.humanize_memory(report.memory),
                         str(int(report.slots * 100 / all_hosts_report_line.slots)) + '%',
                         str(int(report.memory * 100 / all_hosts_report_line.memory)) + '%') for report in lst_report]
            for line in tabularize.format_table(
                    lst_rows, right_justify=[False, True, True, True, True]):
                print line

            print "\n# HISTOGRAM OF SLOT RESERVATIONS FOR QUEUE JUMPERS for job %s\n" % pending_job
            queue_jumper_slots_histo = histogram_tools.make_histogram(
                [job_info.slots for job_info in queue_jumper_resource_infos],
                1)
            lst_rows = [("slots_used", "num_queue_jumper_jobs", "num_queue_jumper_jobs >= this bin")] + \
                       [(str(slots), str(jobs), str(jobs_ge)) for slots, jobs, jobs_ge in queue_jumper_slots_histo]
            for line in tabularize.format_table(
                    lst_rows, right_justify=[True, True,
                                             True], column_sep="   "):
                print line

            lst_available_hosts = [
                (host.host, host.slots - host.slots_used,
                 host.memory - host.mem_used) for host in lst_hosts
                if host.state is None and host.mem_used is not None
                and host.slots - host.slots_used >= resource_info.slots
                and host.memory - host.mem_used >= resource_info.memory
                and resource_info.queue in host.queues
            ]
            # Sort by (available_slots, available_memory)
            lst_available_hosts.sort(key=lambda _tup: _tup[1:])
            # Convert to strings for printing
            lst_available_hosts = [(tup[0], str(tup[1]),
                                    uge_functions.humanize_memory(tup[2]))
                                   for tup in lst_available_hosts]
            print "\n# %d HEALTHY HOSTS WITH AT LEAST %d SLOTS AND %s MEMORY IN %s QUEUE FOR JOB %s\n" % \
                  (len(lst_available_hosts), resource_info.slots, uge_functions.humanize_memory(resource_info.memory),
                   resource_info.queue, pending_job)
            lst_rows = [("host", "available_slots", "available_memory")
                        ] + lst_available_hosts
            for line in tabularize.format_table(
                    lst_rows, right_justify=[False, True,
                                             True], column_sep="   "):
                print line

            # report on pending jobs with more tickets than query job
            lst_high_ticket_pending_jobs = [
                job_info for job_info in pending_jobs_resource_info
                if job_info.tickets > resource_info.tickets
            ]

            print "\n# Pending jobs with more tickets than job " + pending_job
            lst_rows = [('class', 'slots', 'memory')] + \
                       [(report.label, str(report.slots), uge_functions.humanize_memory(report.memory)) for report in
                        (report_by_queue_project_and_user(lst_high_ticket_pending_jobs, "pending jobs",
                                                          dct_shares_by_project))]
            for line in tabularize.format_table(
                    lst_rows, right_justify=[False, True, True]):
                print line

            print "\n# Pending jobs with more tickets than job " + pending_job + " but submitted after"
            lst_high_ticket_pending_jumpers = [
                job_info for job_info in lst_high_ticket_pending_jobs
                if job_info.submit_time > resource_info.submit_time
            ]
            lst_rows = [('class', 'slots', 'memory')] + \
                       [(report.label, str(report.slots), uge_functions.humanize_memory(report.memory)) for report in
                        (report_by_queue_project_and_user(lst_high_ticket_pending_jumpers,
                                                          "pending high-ticket queue jumpers", dct_shares_by_project))]
            for line in tabularize.format_table(
                    lst_rows, right_justify=[False, True, True]):
                print line

            if options.pending_detail:
                # get tickets for running jobs
                queue_jumper_resource_infos = replace_tickets(
                    queue_jumper_resource_infos,
                    uge_state.get_xml(uge_state.uge_snapshot_jobs))
                # Both these tables are in descending order by tickets
                queue_jumper_resource_infos.sort(
                    cmp=lambda x, y: cmp(y.tickets, x.tickets))
                lst_high_ticket_pending_jumpers.sort(
                    cmp=lambda x, y: cmp(y.tickets, x.tickets))
                lst_header = [
                    'job', 'task', 'user', 'project', 'slots', 'memory',
                    'tickets', 'h_rt', 'submitted', 'dispatched'
                ]
                print "\n# Running queue jumper details"
                lst_rows = [lst_header] + [
                    (job.job_and_task.job_id, job.job_and_task.task_id,
                     job.user, job.project, str(job.slots),
                     uge_functions.humanize_memory(job.memory), str(
                         job.tickets), uge_functions.humanize_seconds(
                             job.h_rt), format_time(
                                 job.submit_time), format_time(job.start_time))
                    for job in queue_jumper_resource_infos
                ]
                for line in tabularize.format_table(lst_rows,
                                                    right_justify=[
                                                        False, True, False,
                                                        False, True, True,
                                                        True, True, False,
                                                        False
                                                    ]):
                    print line

                lst_header = [
                    'job', 'user', 'project', 'slots', 'memory', 'tickets',
                    'h_rt', 'submitted'
                ]
                print "\n# Pending queue jumper details"
                lst_rows = [lst_header] + [
                    (job.job_and_task.job_id, job.user, job.project,
                     str(job.slots), uge_functions.humanize_memory(job.memory),
                     str(job.tickets), uge_functions.humanize_seconds(
                         job.h_rt), format_time(job.submit_time))
                    for job in lst_high_ticket_pending_jumpers
                ]
                for line in tabularize.format_table(lst_rows,
                                                    right_justify=[
                                                        False, False, False,
                                                        True, True, True, True,
                                                        False
                                                    ]):
                    print line
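A minimal sketch of the "queue jumper" selection used above: running jobs whose submit_time is later than the pending job's. The namedtuple is a simplified stand-in for the JobResourceInfo records built earlier; the values are invented.
from collections import namedtuple

Job = namedtuple('Job', 'job_id submit_time')   # simplified stand-in for JobResourceInfo

pending = Job('100', 1000.0)
running = [Job('90', 900.0), Job('110', 1100.0), Job('120', 1200.0)]

queue_jumpers = [job for job in running if job.submit_time > pending.submit_time]
print([job.job_id for job in queue_jumpers])    # ['110', '120']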
Example no. 19
0
def parse_host_info(qhost_tree, queues_tree, queues_to_ignore=[]):
    """
    :return: dictionary key: host, value HostInfo
    """
    dctRet = {}
    for host_node in qhost_tree.findall('host'):
        host_name = host_node.get('name')
        dct_hostvalues = dict([
            (hostvalue_node.get('name'), hostvalue_node.text)
            for hostvalue_node in host_node.findall('hostvalue')
        ])
        if dct_hostvalues['num_proc'] != '-':
            slots = int(dct_hostvalues['num_proc'])
            slots_used = sum([
                int(slots_used_node.text) for slots_used_node in
                host_node.findall(".//queuevalue[@name='slots_used']")
            ])
            memory = dehumanize_memory(dct_hostvalues['mem_total'])
            mem_used = 0 if dct_hostvalues[
                'mem_used'] == '-' else dehumanize_memory(
                    dct_hostvalues['mem_used'])
            dctRet[host_name] = HostInfo(host=host_name,
                                         slots=slots,
                                         memory=memory,
                                         state=None,
                                         slots_used=slots_used,
                                         mem_used=mem_used,
                                         queues=set())
        else:
            dctRet[host_name] = HostInfo(host=host_name,
                                         slots=None,
                                         memory=None,
                                         state=None,
                                         slots_used=None,
                                         mem_used=None,
                                         queues=set())
    for queue_info in queues_tree.findall('*/Queue-List'):
        state = queue_info.findtext('state')
        if state is None: state = ''
        # Ignore suspended state
        state = re.sub('s', '', state)
        # Ignore configuration ambiguous state
        state = re.sub('c', '', state)
        # If disabled, ignore other state flags, because they can vary between queues on a host
        if 'd' in state:
            state = 'd'

        queue = queue_info.findtext('name')
        queue_split = queue.split('@', 1)
        host = queue_split[1]
        queue_name = queue_split[0]
        if queue_name in queues_to_ignore:
            continue
        host_info = dctRet.get(host)
        if host_info is None:
            logging.log_message(host + " found in qstat but not qhost")
            continue
        host_info.queues.add(queue_name)

        if len(state) > 0:
            if host_info.state is None:
                dctRet[host] = host_info._replace(state=state)
            elif not is_host_state_compatible(host_info.state, state):
                raise Exception("Conflicting states for %s: %s != %s" %
                                (host, host_info.state, state))

    return dctRet
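The state handling above strips suspended and configuration-ambiguous flags and collapses anything disabled to 'd'; a short standalone check of that normalization with sample state strings:
import re

def normalize_state(state):
    state = state or ''
    state = re.sub('s', '', state)   # ignore suspended
    state = re.sub('c', '', state)   # ignore configuration-ambiguous
    return 'd' if 'd' in state else state

for raw in ['', 's', 'au', 'd', 'adu', 'cs']:
    print("%r -> %r" % (raw, normalize_state(raw)))
# '' -> '', 's' -> '', 'au' -> 'au', 'd' -> 'd', 'adu' -> 'd', 'cs' -> ''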
Example no. 20
0
def main(args=None):
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        "--email",
        "-e",
        action="append",
        help=
        "email address(es) to which report should be sent.  Default: write report to stdout"
    )
    parser.add_argument(
        "--user",
        "-u",
        action="append",
        help="User(s) whose jobs should be checked.  Default: all users")
    parser.add_argument(
        "--user-file",
        type=read_list_file,
        help=
        "File containing list of users whose jobs should be checked, one per line."
    )
    parser.add_argument("--exclude-user",
                        action="append",
                        help="User(s) whose jobs should not be checked.")
    parser.add_argument(
        "--exclude-user-file",
        type=read_list_file,
        help=
        "File containing list of users whose jobs should not be checked, one per line."
    )
    parser.add_argument("--ignore-job",
                        "-j",
                        action="append",
                        help="Job ID(s) that should be ignored")
    parser.add_argument(
        "--ignore-job-file",
        type=read_list_file,
        help=
        "File containing list of job ID(s) that should be ignored, one per line"
    )
    parser.add_argument("--ignore-host",
                        action="append",
                        help="Host(s) that should be ignored")
    parser.add_argument(
        "--ignore-host-file",
        type=read_list_file,
        help=
        "File containing list of host(s) that should be ignored, one per line")
    parser.add_argument(
        "--sleep-mins",
        "-s",
        type=float,
        default=60,
        help=
        "Time (in minutes) to sleep between status checks.  Default: %(default)s"
    )
    parser.add_argument(
        "--heartbeat-mins",
        type=float,
        default=24 * 60,
        help=
        "Time (in minutes) to sleep between sending message even if no problems.  "
        "Default: %(default)s")
    parser.add_argument(
        "--delete-mins",
        "-d",
        type=float,
        default=30,
        help=
        "A job in deletion state for more than this time (in minutes) is treated as a problem.  "
        "Default: %(default)s")
    parser.add_argument(
        "--transfer-mins",
        "-t",
        type=float,
        default=30,
        help=
        "A job in transfer state for more than this time (in minutes) is treated as a problem.  "
        "Default: %(default)s")
    parser.add_argument(
        "--cpu_fraction",
        "-c",
        type=float,
        default=0.0001,
        help=
        "A job with cpu time/wallclock time < this value is considered cpu-starved.  "
        "Default: %(default)s")
    parser.add_argument(
        "--starved-jobs-per-host",
        type=int,
        default=2,
        help=
        "A cpu-starved job is not reported unless there are at least this many cpu-starved jobs"
        " on the host.  Multiple task for the same job are counted as one.  Default: %(default)s"
    )
    parser.add_argument(
        "--starved-task-array-hosts",
        type=int,
        default=4,
        help=
        "If a task array has cpu-starved jobs on at least this many hosts, it is assumed that "
        "the job is waiting, rather than that the host is starving it of cpu.  Default: %(default)s"
    )
    parser.add_argument(
        "--wallclock_threshold",
        "-w",
        type=float,
        default=3,
        help=
        "A job is not considered cpu-starved until its wallclock time (minutes) is >= this value. "
        "Default: %(default)s")
    parser.add_argument(
        "--cpu-hog",
        type=float,
        default=2.0,
        help=
        "A job with (cpu time/wallclock time) - slots > this value is considered a cpu hog"
        "Default: %(default)s")
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help=
        "Write progress to stderr. Use this option multiple times for more verbosity"
    )
    parser.add_argument(
        "--smtp",
        default="smtp.broadinstitute.org",
        help="SMTP host for sending mail. Default: %(default)s")
    parser.add_argument(
        "--name",
        "-n",
        default="UGER",
        help=
        "Name of batch system to be used in messages.  Default: %(default)s")

    # key: host; value: time host failed ping
    dct_no_ping_hosts = {}

    # key: host; value: time bad state first seen
    dct_unreachable_queue_hosts = {}

    # key: JobAndTask; value: ProblemState(state, time state first seen, time reported)
    dct_strange_state_jobs = {}
    last_summary_time = time.time()

    signal.signal(signal.SIGINT, sigint_handler)

    options = parser.parse_args(args)
    # Do not send more than one exception message every 30 minutes, to avoid email avalanche
    mail_throttler = email_notify.MailThrottler(
        smtp=options.smtp,
        recipients=options.email,
        throttle=30 * 60,
        subject="UGER monitor unhandled exception")

    logging.log_message("Starting daemon")
    while True:
        try:
            # parse options again every time so that any files are reloaded
            options = parser.parse_args(args)
            logging.log_message("Options: " + str(options), 1, options.verbose)
            options.user = combine_option_lists(options.user,
                                                options.user_file, ["*"])
            excluded_users = frozenset(
                combine_option_lists(options.exclude_user,
                                     options.exclude_user_file))
            ignored_jobs = frozenset(
                combine_option_lists(options.ignore_job,
                                     options.ignore_job_file))
            ignored_hosts = frozenset(
                combine_option_lists(options.ignore_host,
                                     options.ignore_host_file))

            logging.log_message("Getting job list", 1, options.verbose)
            # remove jobs to be ignored
            jobs = [
                job for job in uge_functions.get_job_list(options.user)
                if job.user not in excluded_users and job.host is not None
                and job.host not in ignored_hosts
                and job.job_and_task.job_id not in ignored_jobs
            ]

            logging.log_message("Clearing strange jobs that are gone", 1,
                                options.verbose)
            # Remove strange jobs that were not returned in job list
            current_job_set = set([job.job_and_task for job in jobs])
            for strange_job in dct_strange_state_jobs.keys():
                if strange_job not in current_job_set:
                    remove_strange_job(dct_strange_state_jobs, strange_job)

            # key: host; value: list of problem strings
            dct_problems = {}

            logging.log_message("Pinging hosts", 1, options.verbose)
            for newly_failed_host in ping_hosts(
                    set(dct_no_ping_hosts.keys() + [job.host for job in jobs]),
                    dct_no_ping_hosts):
                dct_problems.setdefault(newly_failed_host,
                                        []).append("did not respond to ping")

            logging.log_message("Checking queue states", 1, options.verbose)
            for newly_failed_host in check_host_queue_state(
                    set(dct_unreachable_queue_hosts.keys() +
                        [job.host for job in jobs]),
                    dct_unreachable_queue_hosts):
                dct_problems.setdefault(
                    newly_failed_host,
                    []).append("has queue in unreachable state")

            running_jobs = []
            logging.log_message("Checking job state", 1, options.verbose)
            transfer_state = "transfer"
            deletion_state = "deletion"
            for job in jobs:
                if is_transfer_state(job.state):
                    update_strange_jobs(dct_strange_state_jobs, job,
                                        transfer_state)
                elif is_delete_state(job.state):
                    update_strange_jobs(dct_strange_state_jobs, job,
                                        deletion_state)
                elif is_running_state(job.state):
                    running_jobs.append(job)
                    problem_state = dct_strange_state_jobs.get(
                        job.job_and_task)
                    if problem_state is not None and \
                        (problem_state.state == transfer_state or problem_state.state == deletion_state):
                        logging.log_message(
                            "Job %s leaving strange state %s and now running" %
                            (str(job),
                             dct_strange_state_jobs[job.job_and_task]), 1,
                            options.verbose)
                        remove_strange_job(dct_strange_state_jobs,
                                           job.job_and_task)
                elif job.job_and_task in dct_strange_state_jobs:
                    # job not in running state anymore
                    logging.log_message("Odd job state " + str(job), 1,
                                        options.verbose)
                    remove_strange_job(dct_strange_state_jobs,
                                       job.job_and_task)

            logging.log_message("Checking cpu usage", 1, options.verbose)
            # groupby requires its input to be sorted by the grouping key
            running_jobs.sort(cmp=lambda job1, job2: cmp(
                (job1.job_and_task.job_id, job1.job_and_task.task_id),
                (job2.job_and_task.job_id, job2.job_and_task.task_id)))
            for ignore_key, job_group in itertools.groupby(
                    running_jobs, key=lambda j: j.job_and_task.job_id):
                # convert from iterable to list, because the first element is accessed twice
                jobs_in_group = list(job_group)
                if jobs_in_group[0].queue == "interactive":
                    # Interactive jobs can appear cpu-starved
                    continue
                job_id = jobs_in_group[0].job_and_task.job_id
                logging.log_message("Getting details for " + str(job_id), 2,
                                    options.verbose)
                dct_job_details = uge_functions.get_job_detail(job_id)
                if dct_job_details is None:
                    # all tasks for this job appear to be gone
                    for job in jobs_in_group:
                        remove_strange_job(dct_strange_state_jobs,
                                           job.job_and_task)
                else:
                    for job in jobs_in_group:
                        job_details = dct_job_details.get(
                            job.job_and_task.task_id)
                        if job_details is None:
                            # this job and task is done
                            remove_strange_job(dct_strange_state_jobs,
                                               job.job_and_task)
                        else:
                            cpu_fraction = cpu_starved(
                                job_details, options.wallclock_threshold,
                                options.cpu_fraction)
                            if cpu_fraction is not None:
                                update_strange_jobs(
                                    dct_strange_state_jobs, job,
                                    "cpu-starved %f" % cpu_fraction,
                                    job_details)
                            else:
                                cpu_hog_amount = cpu_hog(
                                    job_details, options.cpu_hog)
                                if cpu_hog_amount is not None:
                                    update_strange_jobs(
                                        dct_strange_state_jobs, job,
                                        "cpu-hog %f" % cpu_hog_amount,
                                        job_details)

            report_time_threshold = time.time() - 60 * options.sleep_mins
            deletion_time_threshold = time.time() - 60 * options.delete_mins
            transfer_time_threshold = time.time() - 60 * options.transfer_mins
            dct_count_starved_jobs_by_host, dct_starved_job_host_count = \
                count_starved_jobs_by_host(dct_strange_state_jobs,
                                           options.starved_jobs_per_host if options.verbose > 1 else None,
                                           options.starved_task_array_hosts if options.verbose > 1 else None)

            for job_and_task, problem_state in dct_strange_state_jobs.items():
                if problem_state.reported_time is None:
                    if problem_state.state == deletion_state:
                        report_problem = problem_state.first_seen_time <= deletion_time_threshold
                    elif problem_state.state == transfer_state:
                        report_problem = problem_state.first_seen_time <= transfer_time_threshold
                    elif problem_state.state.startswith("cpu-starved"):
                        report_problem = problem_state.first_seen_time <= report_time_threshold and \
                            dct_count_starved_jobs_by_host[problem_state.host] >= options.starved_jobs_per_host and \
                            dct_starved_job_host_count[job_and_task.job_id] < options.starved_task_array_hosts
                    else:
                        report_problem = problem_state.first_seen_time <= report_time_threshold
                    if report_problem:
                        dct_problems.setdefault(problem_state.host, []).append(
                            format_job_problem(job_and_task, problem_state))
                        dct_strange_state_jobs[
                            job_and_task] = problem_state._replace(
                                reported_time=time.time())

            # Generate message for all problems even when only reporting new problems.
            do_heartbeat = time.time(
            ) - last_summary_time >= 60 * options.heartbeat_mins
            if len(dct_problems) > 0 or do_heartbeat:
                last_summary_time = time.time()
                all_problems = {}
                for host in dct_no_ping_hosts.keys():
                    all_problems.setdefault(
                        host, []).append("did not respond to ping")
                for host in dct_unreachable_queue_hosts.keys():
                    all_problems.setdefault(
                        host, []).append("has queue in unreachable state")
                for job_and_task, problem_state in dct_strange_state_jobs.items(
                ):
                    if problem_state.reported_time is not None:
                        all_problems.setdefault(problem_state.host, []). \
                            append(format_job_problem(job_and_task, problem_state))

            if len(dct_problems) > 0:
                message = "The following problems have been detected since the last check:\n\n" + \
                          format_problems(dct_problems) + \
                          "\n\nAll outstanding problems:\n\n" + format_problems(all_problems)
                logging.log_message(message)
                if options.email is not None:
                    email_notify.email_report(
                        message, "New " + options.name + " problems",
                        options.email, options.smtp)

            if do_heartbeat:
                message = "Previously reported problems that have not cleared:\n\n" + format_problems(
                    all_problems)
                logging.log_message(message)
                if options.email is not None:
                    email_notify.email_report(
                        message, "All " + options.name + " problems",
                        options.email, options.smtp)

            logging.log_message("Sleeping for %s minutes" % options.sleep_mins,
                                1, options.verbose)
            time.sleep(options.sleep_mins * 60)
        except Exception, e:
            exc_str = traceback.format_exc()
            logging.log_message("Looping after unhandled exception: " +
                                exc_str)
            mail_throttler.add_message(exc_str)
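The cpu-usage pass sorts running jobs by job id before itertools.groupby because groupby only merges adjacent keys; a tiny illustration with invented job/task pairs:
import itertools
from collections import namedtuple

JobAndTask = namedtuple('JobAndTask', 'job_id task_id')   # stand-in for the real type

tasks = [JobAndTask('2', '1'), JobAndTask('1', '1'), JobAndTask('2', '2')]
tasks.sort(key=lambda t: (t.job_id, t.task_id))           # required before groupby
for job_id, group in itertools.groupby(tasks, key=lambda t: t.job_id):
    print("%s -> %s" % (job_id, [t.task_id for t in group]))
# 1 -> ['1']
# 2 -> ['1', '2']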
Example no. 21
0
def write_system_config_to_file(system_config_file, config):
    log_message(write_system_config_to_file.__name__,
                'write system config started')
    filename = "{}{}.cfg".format(OUTPUT_SYSTEM_CONFIG, system_config_file)
    write_lines_to_file(filename, config)
    log_message(write_system_config_to_file.__name__,
                'write system config completed')
Example no. 22
0
def write_device_config_to_file(device, config):
    log_message(write_device_config_to_file.__name__,
                'write device config started')
    filename = "{}{}.cfg".format(OUTPUT_DEVICE_CONFIG, device)
    write_lines_to_file(filename, config)
    log_message(write_device_config_to_file.__name__,
                'write device config completed')
def main(args=None):
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--job-duration",
        "-j",
        type=float,
        default=60.0,
        help=
        "Count jobs that die within this many seconds.  Default: %(default)s")
    parser.add_argument("--num-failed-jobs",
                        "-n",
                        type=int,
                        default=5,
                        help="Report if at least this many distinct jobs "
                        "(as opposed to multiple tasks for the same job)"
                        " die on the same host.  Default: %(default)s")
    parser.add_argument(
        "--interval",
        "-i",
        type=float,
        default=300.0,
        help=
        "Report if enough jobs failed on the same host within this many seconds.  "
        "Default: %(default)s")
    if full_accounting_path is None:
        parser.add_argument(
            "--accounting",
            "-a",
            required=True,
            help="Grid Engine accounting file to monitor.  "
            "This is required if SGE_ROOT and SGE_CELL environment variables are not set.  "
            "Load a Grid Engine dotkit to set these environment variables.")
    else:
        parser.add_argument(
            "--accounting",
            "-a",
            default=full_accounting_path,
            help="Grid Engine accounting file to monitor.  Default: %(default)s"
        )

    parser.add_argument(
        "--email",
        "-e",
        action="append",
        help=
        "email address(es) to which report should be sent.  Default: only write report to stdout"
    )
    parser.add_argument(
        "--smtp",
        default="smtp.broadinstitute.org",
        help="SMTP host for sending mail. Default: %(default)s")
    parser.add_argument(
        "--email-interval",
        type=int,
        default=300,
        help=
        "Send an email message no more frequently than this many seconds, in order to avoid "
        "swamping mail server.  Default: %(default)s")
    parser.add_argument(
        "--heartbeat-interval",
        type=int,
        default=60 * 60 * 24,
        help=
        "Send an email message after this many seconds, to make sure the daemon is alive.  "
        "Default: %(default)s")
    parser.add_argument(
        "--verbose",
        "-v",
        action="count",
        default=0,
        help=
        "Write progress to stderr. Use this option multiple times for more verbosity"
    )
    options = parser.parse_args(args)
    logging.log_message(str(options))

    mail_throttler = email_notify.MailThrottler(
        smtp=options.smtp,
        recipients=options.email,
        throttle=options.email_interval,
        subject="UGER black hole hosts")

    last_heartbeat = time.time()

    # bufsize=-1 => 'fully buffered'
    proc = subprocess.Popen(['tail', '-F', options.accounting],
                            stdout=subprocess.PIPE,
                            bufsize=-1)
    # Skip first line which may be truncated
    proc.stdout.readline()
    dct_by_host = dict()
    for job in generate_accounting_lines(proc.stdout):
        now = time.time()
        if now - last_heartbeat >= options.heartbeat_interval:
            email_notify.email_report(
                message="Grid Engine black hole detector is alive",
                subject="Grid Engine black hole heartbeat",
                recipients=options.email,
                smtp_host=options.smtp)
            logging.log_message("Grid Engine black hole detector is alive")
            last_heartbeat = now

        mail_throttler.maybe_send()
        logging.log_message("Terminated job: " + str(job),
                            level=3,
                            verbosity=options.verbose)
        if job.exit_status >= 128 and job.duration <= options.job_duration and job.deleted_by is None:
            b_potential_black_hole = False
            logging.log_message("Failed job " + str(job),
                                level=2,
                                verbosity=options.verbose)
            update_dict(dct_by_host,
                        job,
                        age_out_interval=options.interval * 2)
            for host in dct_by_host.iterkeys():
                dct_host = dct_by_host[host]
                if len(dct_host) >= options.num_failed_jobs:
                    b_potential_black_hole = True
                    logging.log_message("potential black hole " + host,
                                        level=1,
                                        verbosity=options.verbose)
                    sorted_jobs = sorted(dct_host.itervalues(),
                                         key=lambda j: j.end_time)
                    for i in xrange(0,
                                    len(sorted_jobs) -
                                    options.num_failed_jobs):
                        if sorted_jobs[i+options.num_failed_jobs].end_time - sorted_jobs[i].end_time <= \
                                options.interval:
                            black_hole_message = "BLACK HOLE: %s has %d failed jobs" % (
                                host, len(sorted_jobs))
                            logging.log_message(black_hole_message,
                                                file=sys.stdout)
                            logging.log_message(black_hole_message)
                            mail_throttler.add_message(black_hole_message)
                            break
            if b_potential_black_hole:
                logging.log_message("Current failed job state: " +
                                    format_failed_jobs_dict(dct_by_host),
                                    level=2,
                                    verbosity=options.verbose)
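The black-hole test asks whether enough distinct failed jobs ended within the configured interval; a self-contained sketch of that sliding-window check over hypothetical end times:
end_times = [10.0, 20.0, 30.0, 40.0, 400.0]    # hypothetical per-job end times, sorted
num_failed_jobs = 3
interval = 300.0

black_hole = False
for i in range(len(end_times) - num_failed_jobs):
    # same test as above: jobs i and i + num_failed_jobs ended close together
    if end_times[i + num_failed_jobs] - end_times[i] <= interval:
        black_hole = True
        break
print(black_hole)   # True: 10.0 .. 40.0 (indices 0 and 3) fall within 300 seconds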
Example no. 24
0
def read_topology(devices_valid):
    '''Reads from FILE_PHY_INTERFACES, updates Device info and returns the topology
    Checks implemented:
        no duplicated endpoint
        no physical loopback (cable connected to same device)
        endpoint using valid device
     '''
    log_message(read_topology.__name__, 'physical interfaces reading started')
    lines = read_lines_from_file(FILE_PHY_INTERFACES)
    devices = []
    endpoints = []
    links = []
    devices_valid_dict = {device.hostname: device for device in devices_valid}

    for line in lines:
        if ':' in line:  #link_id is constructed using ':'
            fields = line.split(',')
            link_id = fields[0]
            network_base_input = fields[1]
            network_prefix_input = fields[2]
            a_and_device_input = fields[3]
            a_end_interface_input = fields[4]
            a_end_host_ip_input = fields[5]
            z_and_device_input = fields[6]
            z_end_interface_input = fields[7]
            z_end_host_ip_input = fields[8]
            a_end_device = Device(a_and_device_input)
            a_and_interface = Interface(
                a_end_interface_input,
                f'{network_base_input}.{a_end_host_ip_input}',
                network_prefix_input)
            z_end_device = Device(z_and_device_input)
            z_and_interface = Interface(
                z_end_interface_input,
                f'{network_base_input}.{z_end_host_ip_input}',
                network_prefix_input)
            a_end_endpoint = Endpoint(a_end_device, a_and_interface)
            z_end_endpoint = Endpoint(z_end_device, z_and_interface)
            devices.append(a_end_device)
            devices.append(z_end_device)
            endpoints.append(a_end_endpoint)
            endpoints.append(z_end_endpoint)
            link = Link(a_end_endpoint, z_end_endpoint)
            try:
                devices_valid_dict[
                    link.endpoint_a_end.device.hostname].interfaces_phy.append(
                        link.endpoint_a_end.interface)
                devices_valid_dict[
                    link.endpoint_z_end.device.hostname].interfaces_phy.append(
                        link.endpoint_z_end.interface)
            except KeyError:
                print(
                    f"{a_end_device.hostname} and/or {z_end_device.hostname} not found in in devices_valid"
                )
            links.append(link)

    if not set(devices).issubset(set(devices_valid)):
        print(f"Invalid devices found:{set(devices) - set(devices_valid)}")

        interrupt_execution_with_errors(read_topology.__name__,
                                        "Invalid device read")

    if not len(endpoints) == len(set(endpoints)):
        interrupt_execution_with_errors(read_topology.__name__,
                                        "Physical Duplicated endpoint found")

    log_message(read_topology.__name__, 'physical interfaces reading finished')

    return links
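A sketch of how one FILE_PHY_INTERFACES line appears to be laid out, inferred from the field order read above; the sample line, interface names, and addresses are invented.
line = "r1:r2,10.0.0,30,r1,ge-0/0/0,1,r2,ge-0/0/1,2"   # hypothetical physical-link line
fields = line.split(',')
prefix = fields[2]
a_ip = "{}.{}/{}".format(fields[1], fields[5], prefix)   # network base + A-end host part
z_ip = "{}.{}/{}".format(fields[1], fields[8], prefix)   # network base + Z-end host part
print("{} {} {} <-> {} {} {}".format(fields[3], fields[4], a_ip,
                                     fields[6], fields[7], z_ip))
# r1 ge-0/0/0 10.0.0.1/30 <-> r2 ge-0/0/1 10.0.0.2/30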