Beispiel #1
0
# CGI script body: print an HTML page listing every entry found in
# configuration.resource_home with an "edit" link to resource_edit.py,
# followed by a "New" button that posts to resource_edit.py.

# os.getenv returns None when SSL_CLIENT_S_DN_CN is unset, and str(None) is
# 'None' - that string is what the check below tests for.
cert_name = str(os.getenv('SSL_CLIENT_S_DN_CN'))
cert_no_spaces = cert_name.replace(' ', '_')
if cert_no_spaces == 'None':
    # No certificate common name in the environment: exit before any output.
    sys.exit(1)

configuration = get_configuration_object()
logger = configuration.logger
logger.info('Resource list GUI: start')

# Every print in this script takes exactly one parenthesized argument, so the
# output is the same as with the Python 2 print statement.
print('''Content-type: text/html

''')

form = cgi.FieldStorage()

print(get_cgi_html_header(configuration, 'Grid Resource administration',
                          'Welcome to the Grid resource administration.'))

for entry in os.listdir(configuration.resource_home):
    # Split "<hosturl>.<hostidentifier>" on the LAST dot.
    hosturl, dot, hostidentifier = entry.rpartition('.')
    if not dot:
        # An entry without any '.' cannot be split; skip it instead of
        # letting a ValueError abort the page after the header was sent.
        logger.info('Resource list GUI: skipping %s (no "." in name)'
                    % entry)
        continue
    print("     <a href='resource_edit.py?hosturl=" + hosturl
          + '&hostidentifier=' + hostidentifier
          + "'>edit</a>&nbsp;&nbsp;<b>" + entry + '</b><br />')

print("""	<hr>
	    <form action="./resource_edit.py" method="post">
	    <input type="hidden" name="new_resource" value="true" />
	    <input type="submit" name="New" value="New" />
	    </form>
	    """)
Beispiel #2
0
def create_monitor(vgrid_name):
    """Write monitor HTML file for vgrid_name"""

    html_file = os.path.join(configuration.vgrid_home, vgrid_name, "%s.html" % configuration.vgrid_monitor)

    print "collecting statistics for VGrid %s" % vgrid_name
    sleep_secs = configuration.sleep_secs
    slackperiod = configuration.slackperiod
    now = time.asctime(time.localtime())

    html_vars = {
        "sleep_secs": sleep_secs,
        "vgrid_name": vgrid_name,
        "logo_url": "/images/logo.jpg",
        "now": now,
        "short_title": configuration.short_title,
    }

    html = get_cgi_html_header(
        configuration,
        "%(short_title)s Monitor, VGrid %(vgrid_name)s" % html_vars,
        "",
        True,
        """<meta http-equiv="refresh" content="%(sleep_secs)s" />
        """
        % html_vars,
        themed_styles(configuration),
        """
<script type="text/javascript" src="/images/js/jquery.js"></script>
<script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script>

<script type="text/javascript" >

$(document).ready(function() {

          // table initially sorted by col. 1 (name)
          var sortOrder = [[1,0]];

          // use image path for sorting if there is any inside
          var imgTitle = function(contents) {
              var key = $(contents).find("a").attr("class");
              if (key == null) {
                  key = $(contents).html();
              }
              return key;
          }
          $("table.monitor").tablesorter({widgets: ["zebra"],
                                          textExtraction: imgTitle,
                                         });
          $("table.monitor").each(function () {
              try {
                  $(this).trigger("sorton", [sortOrder]);
              } catch(err) {
                  /* tablesorter chokes on empty tables - just continue */
              }
          });
     }
);
</script>
        """,
        "",
        False,
    )
    html += (
        """
<!-- end of raw header: this line is used by showvgridmonitor -->
<h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1>
<div class="generatornote smallcontent">
This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs).
</div>
"""
        % html_vars
    )

    # loop and get totals

    parse_count = 0
    queued_count = 0
    frozen_count = 0
    executing_count = 0
    finished_count = 0
    failed_count = 0
    retry_count = 0
    canceled_count = 0

    cpucount_requested = 0
    cpucount_done = 0
    nodecount_requested = 0
    nodecount_done = 0
    cputime_requested = 0
    cputime_done = 0
    used_walltime = 0
    disk_requested = 0
    disk_done = 0
    memory_requested = 0
    memory_done = 0
    runtimeenv_dict = {"": 0}
    runtimeenv_requested = 0
    runtimeenv_done = 0

    number_of_jobs = 0
    up_count = 0
    down_count = 0
    slack_count = 0

    job_assigned = 0
    job_assigned_cpus = 0

    gstat = GridStat(configuration, logger)

    runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT", {})

    parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "PARSE")
    queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "QUEUED")
    frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FROZEN")
    executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXECUTING")
    failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FAILED")
    retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RETRY")
    canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CANCELED")
    expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXPIRED")
    finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FINISHED")

    nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_REQ")
    nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_DONE")
    cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_REQ")
    cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_DONE")

    used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "USED_WALLTIME")

    if used_walltime == 0:
        used_walltime = datetime.timedelta(0)

    used_walltime = format_timedelta(used_walltime)

    disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_REQ")
    disk_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_DONE")
    memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_REQ")
    memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_DONE")
    cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_REQ")
    cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_DONE")
    runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_REQ")
    runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_DONE")

    number_of_jobs = parse_count
    number_of_jobs += queued_count
    number_of_jobs += frozen_count
    number_of_jobs += expired_count
    number_of_jobs += canceled_count
    number_of_jobs += failed_count
    number_of_jobs += executing_count
    number_of_jobs += finished_count
    number_of_jobs += retry_count

    html_vars = {
        "parse_count": parse_count,
        "queued_count": queued_count,
        "frozen_count": frozen_count,
        "executing_count": executing_count,
        "failed_count": failed_count,
        "retry_count": retry_count,
        "canceled_count": canceled_count,
        "expired_count": expired_count,
        "finished_count": finished_count,
        "number_of_jobs": number_of_jobs,
        "cpucount_requested": cpucount_requested,
        "cpucount_done": cpucount_done,
        "nodecount_requested": nodecount_requested,
        "nodecount_done": nodecount_done,
        "cputime_requested": cputime_requested,
        "cputime_done": cputime_done,
        "used_walltime": used_walltime,
        "disk_requested": disk_requested,
        "disk_done": disk_done,
        "memory_requested": memory_requested,
        "memory_done": memory_done,
        "runtimeenv_requested": runtimeenv_requested,
        "runtimeenv_done": runtimeenv_done,
    }

    html += (
        """<h2>Job Stats</h2><table class=monitorstats><tr><td>
<table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr>
<tr><td>Parse</td><td>%(parse_count)s</td></tr>
<tr><td>Queued</td><td>%(queued_count)s</td></tr>
<tr><td>Frozen</td><td>%(frozen_count)s</td></tr>
<tr><td>Executing</td><td>%(executing_count)s</td></tr>
<tr><td>Failed</td><td>%(failed_count)s</td></tr>
<tr><td>Retry</td><td>%(retry_count)s</td></tr>
<tr><td>Canceled</td><td>%(canceled_count)s</td></tr>
<tr><td>Expired</td><td>%(expired_count)s</td></tr>
<tr><td>Finished</td><td>%(finished_count)s</td></tr>
<tr><td>Total</td><td>%(number_of_jobs)s</td></tr>
</table>
</td><td>
<table class=monitorresreq>
<tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr>
<tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr>
<tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr>
<tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr>
<tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr>
<tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr>
<tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr>
<tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr>
</table><br />
</td><td>
<div class=monitorruntimeenvdetails>
<table class=monitorruntimeenvdone>
<tr class=title><td>Runtime Envs Done</td><td></td></tr>
"""
        % html_vars
    )

    if len(runtimeenv_dict.keys()) < 1:

        # No runtimeenv requests

        html += "<tr><td></td><td>-</td></tr>\n"
    else:
        for entry in runtimeenv_dict.keys():
            if not entry == "":
                html += "<tr><td>" + entry + "</td><td>" + str(runtimeenv_dict[entry]) + "</td></tr>\n"

    total_number_of_exe_resources, total_number_of_store_resources = 0, 0
    total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0

    vgrid_name_list = vgrid_name.split("/")
    current_dir = ""

    exes, stores = "", ""
    for vgrid_name_part in vgrid_name_list:
        current_dir = os.path.join(current_dir, vgrid_name_part)
        abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir)
        # print 'dir: %s' % abs_mon_dir
        # Potential race - just ignore if it disappeared
        try:
            sorted_names = os.listdir(abs_mon_dir)
        except OSError:
            continue
        sorted_names.sort()
        for filename in sorted_names:
            # print filename
            if filename.startswith("monitor_last_request_"):

                # read last request helper file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print "found " + mon_file_name
                last_request_dict = unpickle(mon_file_name, logger)
                if not last_request_dict:
                    print "could not open and unpickle: " + mon_file_name
                    continue

                difference = datetime.datetime.now() - last_request_dict["CREATED_TIME"]
                days = str(difference.days)
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                if last_request_dict.has_key("CPUTIME"):
                    cputime = last_request_dict["CPUTIME"]
                elif last_request_dict.has_key("cputime"):
                    cputime = last_request_dict["cputime"]
                else:
                    print "ERROR: last request does not contain cputime field!: %s" % last_request_dict
                    continue

                try:
                    cpusec = int(cputime)
                except ValueError:
                    try:
                        cpusec = int(float(cputime))
                    except ValueError, verr:
                        print "ERROR: failed to parse cputime %s: %s" % (cputime, verr)

                # Include execution delay guesstimate for strict fill
                # LRMS resources

                try:
                    delay = int(last_request_dict["EXECUTION_DELAY"])
                except KeyError:
                    delay = 0
                except ValueError:
                    delay = 0

                time_remaining = (
                    last_request_dict["CREATED_TIME"]
                    + datetime.timedelta(seconds=cpusec)
                    + datetime.timedelta(seconds=delay)
                ) - datetime.datetime.now()
                days_rem = str(time_remaining.days)
                hours_rem = str(time_remaining.seconds / 3600)
                minutes_rem = str((time_remaining.seconds % 3600) / 60)
                seconds_rem = str((time_remaining.seconds % 60) % 60)

                if time_remaining.days < -7:
                    try:
                        print "removing: %s as we havent seen him for %s days." % (
                            mon_file_name,
                            abs(time_remaining).days,
                        )
                        os.remove(mon_file_name)
                    except Exception, err:
                        print "could not remove: '%s' Error: %s" % (mon_file_name, str(err))
                    pass
                else:
                    unique_res_name_and_exe_list = filename.split("monitor_last_request_", 1)
                    if cpusec == 0:
                        resource_status = "unavailable"
                    elif time_remaining.days < 0:

                        # time_remaining.days < 0 means that we have passed the specified time

                        time_rem_abs = abs(time_remaining)
                        if time_rem_abs.days == 0 and int(time_rem_abs.seconds) < int(slackperiod):
                            resource_status = "slack"
                            slack_count = slack_count + 1
                        else:
                            resource_status = "offline"
                            down_count = down_count + 1
                    else:
                        resource_status = "online"
                        up_count = up_count + 1

                    exes += "<tr>"
                    exes += "<td><img src=/images/status-icons/%s.png /></td>" % resource_status
                    public_id = unique_res_name_and_exe_list[1]
                    if last_request_dict["RESOURCE_CONFIG"].get("ANONYMOUS", True):
                        public_id = anon_resource_id(public_id)
                    public_name = last_request_dict["RESOURCE_CONFIG"].get("PUBLICNAME", "")
                    resource_parts = public_id.split("_", 2)
                    resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % (
                        resource_parts[0],
                        resource_parts[0],
                    )
                    if public_name:
                        resource_name += "<br />(alias %s)" % public_name
                    else:
                        resource_name += "<br />(no alias)"
                    resource_name += "<br />%s" % resource_parts[1]
                    exes += "<td>%s</td>" % resource_name

                    exes += "<td>%s<br />(%sd %sh %sm %ss ago)</td>" % (
                        time.asctime(last_request_dict["CREATED_TIME"].timetuple()),
                        days,
                        hours,
                        minutes,
                        seconds,
                    )
                    exes += "<td>" + vgrid_name + "</td>"
                    runtime_envs = last_request_dict["RESOURCE_CONFIG"]["RUNTIMEENVIRONMENT"]
                    re_list_text = ", ".join([i[0] for i in runtime_envs])
                    exes += '<td title="%s">' % re_list_text + str(len(runtime_envs)) + "</td>"
                    exes += (
                        "<td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["CPUTIME"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["DISK"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["MEMORY"])
                        + "</td><td>"
                        + str(last_request_dict["RESOURCE_CONFIG"]["ARCHITECTURE"])
                        + "</td>"
                    )
                    exes += (
                        "<td>" + last_request_dict["STATUS"] + "</td><td>" + str(last_request_dict["CPUTIME"]) + "</td>"
                    )

                    exes += "<td class=status_%s>" % resource_status
                    if "unavailable" == resource_status:
                        exes += "-"
                    elif "slack" == resource_status:
                        exes += "Within slack period (%s < %s secs)" % (time_rem_abs.seconds, slackperiod)
                    elif "offline" == resource_status:
                        exes += "down?"
                    else:
                        exes += "%sd, %sh, %sm, %ss" % (days_rem, hours_rem, minutes_rem, seconds_rem)
                    exes += "</td>"

                    exes += "</tr>\n"
                    if last_request_dict["STATUS"] == "Job assigned":
                        job_assigned = job_assigned + 1
                        job_assigned_cpus = job_assigned_cpus + int(
                            last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]
                        ) * int(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"])

                    total_number_of_exe_resources += 1
                    total_number_of_exe_cpus += int(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]) * int(
                        last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"]
                    )
            elif filename.startswith("monitor_last_status_"):

                # store must be linked to this vgrid, not only parent vgrid:
                # inheritance only covers access, not automatic participation

                if current_dir != vgrid_name:
                    continue

                # read last resource action status file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print "found " + mon_file_name
                last_status_dict = unpickle(mon_file_name, logger)
                if not last_status_dict:
                    print "could not open and unpickle: " + mon_file_name
                    continue

                difference = datetime.datetime.now() - last_status_dict["CREATED_TIME"]
                days = str(difference.days)
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                if last_status_dict["STATUS"] == "stopped":
                    time_stopped = datetime.datetime.now() - last_status_dict["CREATED_TIME"]
                    if time_stopped.days > 7:
                        try:
                            print "removing: %s as we havent seen him for %s days." % (
                                mon_file_name,
                                abs(time_stopped).days,
                            )
                            os.remove(mon_file_name)
                        except Exception, err:
                            print "could not remove: '%s' Error: %s" % (mon_file_name, str(err))
                        continue