# Start every accumulator for this user from a clean slate.
walltime_per_user = timedelta(0)
walltime_per_resource = timedelta(0)
resources_walltime = {}
jobs_per_user = 0
jobs_per_resource = 0

# Walk every resource registered for the user and fold its totals into
# the per-user counters.
for resource in userdb[username][RESOURCES]:

    # Number of jobs this resource has finished.
    jobs_per_resource = grid_stat.get_value(
        grid_stat.RESOURCE_TOTAL, resource, 'FINISHED')
    jobs_per_user += jobs_per_resource
    resources_jobs.update({resource: jobs_per_resource})

    # Walltime this resource has used; the stats layer reports 0 when it
    # has nothing, so that case is normalised to an empty timedelta.
    walltime_per_resource = grid_stat.get_value(
        grid_stat.RESOURCE_TOTAL, resource, 'USED_WALLTIME')

    if walltime_per_resource == 0:
        walltime_per_resource = timedelta(0)
    elif walltime_per_user:
        walltime_per_user += walltime_per_resource
    else:
        # First non-empty contribution simply replaces the empty total.
        walltime_per_user = walltime_per_resource
def _split_timedelta(delta):
    """Return (days, hours, minutes, seconds) of delta as strings.

    Only delta.days and delta.seconds are consulted, so the hour, minute
    and second parts are always non-negative even for a negative delta.
    """
    hours, remainder = divmod(delta.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return (str(delta.days), str(hours), str(minutes), str(seconds))


def create_monitor(vgrid_name):
    """Write monitor HTML file for vgrid_name

    Builds the page header and the job statistics tables from GridStat
    values for the vgrid, then scans the vgrid directory and each of its
    parent directories under configuration.vgrid_home for
    'monitor_last_request_*' and 'monitor_last_status_*' helper files:

    * request files older than a week past their expected return are
      removed, the rest are rendered as table rows accumulated in exes
      and counted as online / slack / offline;
    * status files are only considered when they sit in the directory of
      vgrid_name itself, and 'stopped' entries older than a week are
      removed.

    NOTE(review): the part of the function visible here stops inside the
    per-file loop. html_file, html, exes, stores and the total/up/down
    counters are prepared but not written anywhere within this span -
    presumably the remainder of the function does that; confirm before
    relying on this block in isolation.
    """

    html_file = os.path.join(configuration.vgrid_home, vgrid_name,
                             "%s.html" % configuration.vgrid_monitor)

    print("collecting statistics for VGrid %s" % vgrid_name)

    sleep_secs = configuration.sleep_secs
    slackperiod = configuration.slackperiod
    now = time.asctime(time.localtime())

    html_vars = {
        "sleep_secs": sleep_secs,
        "vgrid_name": vgrid_name,
        "logo_url": "/images/logo.jpg",
        "now": now,
        "short_title": configuration.short_title,
    }

    html = get_cgi_html_header(
        configuration,
        "%(short_title)s Monitor, VGrid %(vgrid_name)s" % html_vars,
        "",
        True,
        """<meta http-equiv="refresh" content="%(sleep_secs)s" />
        """ % html_vars,
        themed_styles(configuration),
        """
<script type="text/javascript" src="/images/js/jquery.js"></script>
<script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script>

<script type="text/javascript" >

$(document).ready(function() {

          // table initially sorted by col. 1 (name)
          var sortOrder = [[1,0]];

          // use image path for sorting if there is any inside
          var imgTitle = function(contents) {
              var key = $(contents).find("a").attr("class");
              if (key == null) {
                  key = $(contents).html();
              }
              return key;
          }
          $("table.monitor").tablesorter({widgets: ["zebra"],
                                          textExtraction: imgTitle,
                                         });
          $("table.monitor").each(function () {
              try {
                  $(this).trigger("sorton", [sortOrder]);
              } catch(err) {
                  /* tablesorter chokes on empty tables - just continue */
              }
          });
     }
);
</script>
""",
        "",
        False,
    )
    html += (
        """
<!-- end of raw header: this line is used by showvgridmonitor -->
<h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1>
<div class="generatornote smallcontent">
This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs).
</div>
"""
        % html_vars
    )

    # Counters filled in while scanning the monitor helper files below.
    up_count = 0
    down_count = 0
    slack_count = 0
    job_assigned = 0
    job_assigned_cpus = 0

    # All statistics are looked up under the upper-cased vgrid name.
    gstat = GridStat(configuration, logger)
    vgrid_key = vgrid_name.upper()

    def vgrid_stat(field, *default):
        """Fetch one VGRID statistics value for this vgrid"""
        return gstat.get_value(gstat.VGRID, vgrid_key, field, *default)

    runtimeenv_dict = vgrid_stat("RUNTIMEENVIRONMENT", {})

    html_vars = {}

    # One counter per job state; the total is the sum of all of them.
    number_of_jobs = 0
    for state in ("PARSE", "QUEUED", "FROZEN", "EXECUTING", "FAILED",
                  "RETRY", "CANCELED", "EXPIRED", "FINISHED"):
        state_count = vgrid_stat(state)
        html_vars["%s_count" % state.lower()] = state_count
        number_of_jobs += state_count
    html_vars["number_of_jobs"] = number_of_jobs

    # Requested versus delivered amount for every resource requirement.
    for (var_prefix, field_prefix) in (
        ("cpucount", "CPUCOUNT"),
        ("nodecount", "NODECOUNT"),
        ("cputime", "CPUTIME"),
        ("disk", "DISK"),
        ("memory", "MEMORY"),
        ("runtimeenv", "RUNTIMEENVIRONMENT"),
    ):
        html_vars["%s_requested" % var_prefix] = \
            vgrid_stat("%s_REQ" % field_prefix)
        html_vars["%s_done" % var_prefix] = \
            vgrid_stat("%s_DONE" % field_prefix)

    # A plain 0 means no walltime was recorded: format it as empty delta.
    used_walltime = vgrid_stat("USED_WALLTIME")
    if used_walltime == 0:
        used_walltime = datetime.timedelta(0)
    used_walltime = format_timedelta(used_walltime)
    html_vars["used_walltime"] = used_walltime

    html += (
        """<h2>Job Stats</h2><table class=monitorstats><tr><td>
<table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr>
<tr><td>Parse</td><td>%(parse_count)s</td></tr>
<tr><td>Queued</td><td>%(queued_count)s</td></tr>
<tr><td>Frozen</td><td>%(frozen_count)s</td></tr>
<tr><td>Executing</td><td>%(executing_count)s</td></tr>
<tr><td>Failed</td><td>%(failed_count)s</td></tr>
<tr><td>Retry</td><td>%(retry_count)s</td></tr>
<tr><td>Canceled</td><td>%(canceled_count)s</td></tr>
<tr><td>Expired</td><td>%(expired_count)s</td></tr>
<tr><td>Finished</td><td>%(finished_count)s</td></tr>
<tr><td>Total</td><td>%(number_of_jobs)s</td></tr>
</table>
</td><td>
<table class=monitorresreq>
<tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr>
<tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr>
<tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr>
<tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr>
<tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr>
<tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr>
<tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr>
<tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr>
</table><br />
</td><td>
<div class=monitorruntimeenvdetails>
<table class=monitorruntimeenvdone>
<tr class=title><td>Runtime Envs Done</td><td></td></tr>
"""
        % html_vars
    )

    if not runtimeenv_dict:
        # No runtimeenv requests
        html += "<tr><td></td><td>-</td></tr>\n"
    else:
        for entry in runtimeenv_dict:
            # The empty name is skipped rather than shown as a row
            if entry != "":
                html += "<tr><td>" + entry + "</td><td>" \
                    + str(runtimeenv_dict[entry]) + "</td></tr>\n"

    total_number_of_exe_resources, total_number_of_store_resources = 0, 0
    total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0

    vgrid_name_list = vgrid_name.split("/")
    current_dir = ""

    exes, stores = "", ""
    # Visit the top level vgrid first and then each nested sub-vgrid dir
    for vgrid_name_part in vgrid_name_list:
        current_dir = os.path.join(current_dir, vgrid_name_part)
        abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir)

        # Potential race - just ignore if it disappeared
        try:
            sorted_names = os.listdir(abs_mon_dir)
        except OSError:
            continue
        sorted_names.sort()
        for filename in sorted_names:
            if filename.startswith("monitor_last_request_"):

                # read last request helper file
                # NOTE(review): unpickle is a project helper; these files
                # are presumably only ever written by the server itself -
                # confirm they can never be user-supplied.
                mon_file_name = os.path.join(abs_mon_dir, filename)
                print("found " + mon_file_name)
                last_request_dict = unpickle(mon_file_name, logger)
                if not last_request_dict:
                    print("could not open and unpickle: " + mon_file_name)
                    continue
                if "CREATED_TIME" not in last_request_dict:
                    print("skip broken last request dict: " + mon_file_name)
                    continue

                created_time = last_request_dict["CREATED_TIME"]
                # One timestamp per entry keeps 'ago' and 'remaining'
                # consistent with each other.
                current_time = datetime.datetime.now()
                difference = current_time - created_time
                (days, hours, minutes, seconds) = _split_timedelta(difference)

                if "CPUTIME" in last_request_dict:
                    cputime = last_request_dict["CPUTIME"]
                elif "cputime" in last_request_dict:
                    cputime = last_request_dict["cputime"]
                else:
                    print("ERROR: last request does not contain cputime field!: %s"
                          % last_request_dict)
                    continue

                try:
                    cpusec = int(cputime)
                except ValueError:
                    try:
                        cpusec = int(float(cputime))
                    except ValueError as verr:
                        print("ERROR: failed to parse cputime %s: %s"
                              % (cputime, verr))
                        # Without a usable cputime nothing below can be
                        # computed, so skip like a missing cputime field.
                        continue

                # Include execution delay guesstimate for strict fill
                # LRMS resources
                try:
                    delay = int(last_request_dict["EXECUTION_DELAY"])
                except (KeyError, ValueError):
                    delay = 0

                # Time left until the resource is expected to ask again
                time_remaining = (
                    created_time
                    + datetime.timedelta(seconds=cpusec)
                    + datetime.timedelta(seconds=delay)
                ) - current_time
                (days_rem, hours_rem, minutes_rem, seconds_rem) = \
                    _split_timedelta(time_remaining)

                if time_remaining.days < -7:
                    # More than a week overdue: drop the helper file and
                    # leave the resource out of the listing.
                    print("removing: %s as we havent seen him for %s days."
                          % (mon_file_name, abs(time_remaining).days))
                    try:
                        os.remove(mon_file_name)
                    except OSError as err:
                        print("could not remove: '%s' Error: %s"
                              % (mon_file_name, str(err)))
                    continue

                if cpusec == 0:
                    resource_status = "unavailable"
                    remaining_text = "-"
                elif time_remaining.days < 0:
                    # time_remaining.days < 0 means that we have passed the
                    # specified time
                    time_rem_abs = abs(time_remaining)
                    if time_rem_abs.days == 0 and \
                            int(time_rem_abs.seconds) < int(slackperiod):
                        resource_status = "slack"
                        slack_count += 1
                        remaining_text = \
                            "Within slack period (%s < %s secs)" \
                            % (time_rem_abs.seconds, slackperiod)
                    else:
                        resource_status = "offline"
                        down_count += 1
                        remaining_text = "down?"
                else:
                    resource_status = "online"
                    up_count += 1
                    remaining_text = "%sd, %sh, %sm, %ss" \
                        % (days_rem, hours_rem, minutes_rem, seconds_rem)

                resource_config = last_request_dict["RESOURCE_CONFIG"]

                # Resources are shown under an anonymized id unless they
                # explicitly opted out.
                public_id = filename.split("monitor_last_request_", 1)[1]
                if resource_config.get("ANONYMOUS", True):
                    public_id = anon_resource_id(public_id)
                public_name = resource_config.get("PUBLICNAME", "")
                resource_parts = public_id.split("_", 2)
                # NOTE(review): the link target is empty here and looks
                # like it was lost; only the label is filled in so that the
                # format string and its arguments agree.
                resource_name = "<a href=''>%s</a>" % resource_parts[0]
                if public_name:
                    resource_name += "<br />(alias %s)" % public_name
                else:
                    resource_name += "<br />(no alias)"
                resource_name += "<br />%s" % resource_parts[1]

                runtime_envs = resource_config["RUNTIMEENVIRONMENT"]
                re_list_text = ", ".join([i[0] for i in runtime_envs])

                row = ["<tr>"]
                row.append("<td><img src=/images/status-icons/%s.png /></td>"
                           % resource_status)
                row.append("<td>%s</td>" % resource_name)
                row.append("<td>%s<br />(%sd %sh %sm %ss ago)</td>" % (
                    time.asctime(created_time.timetuple()),
                    days,
                    hours,
                    minutes,
                    seconds,
                ))
                row.append("<td>" + vgrid_name + "</td>")
                row.append('<td title="%s">' % re_list_text
                           + str(len(runtime_envs)) + "</td>")
                for config_key in ("CPUTIME", "NODECOUNT", "CPUCOUNT",
                                   "DISK", "MEMORY", "ARCHITECTURE"):
                    row.append("<td>" + str(resource_config[config_key])
                               + "</td>")
                # Use the cputime found above so that requests carrying
                # only the lowercase key do not fail here.
                row.append("<td>" + last_request_dict["STATUS"]
                           + "</td><td>" + str(cputime) + "</td>")
                row.append("<td class=status_%s>" % resource_status)
                row.append(remaining_text)
                row.append("</td>")
                row.append("</tr>\n")
                exes += "".join(row)

                resource_cpus = int(resource_config["NODECOUNT"]) \
                    * int(resource_config["CPUCOUNT"])
                if last_request_dict["STATUS"] == "Job assigned":
                    job_assigned += 1
                    job_assigned_cpus += resource_cpus

                total_number_of_exe_resources += 1
                total_number_of_exe_cpus += resource_cpus

            elif filename.startswith("monitor_last_status_"):

                # store must be linked to this vgrid, not only parent vgrid:
                # inheritance only covers access, not automatic participation
                if current_dir != vgrid_name:
                    continue

                # read last resource action status file
                mon_file_name = os.path.join(abs_mon_dir, filename)
                print("found " + mon_file_name)
                last_status_dict = unpickle(mon_file_name, logger)
                if not last_status_dict:
                    print("could not open and unpickle: " + mon_file_name)
                    continue
                if "CREATED_TIME" not in last_status_dict:
                    print("skip broken last status dict: " + mon_file_name)
                    continue

                difference = datetime.datetime.now() \
                    - last_status_dict["CREATED_TIME"]
                # Not consumed within the visible span; presumably used by
                # the store row rendering further down - confirm.
                (days, hours, minutes, seconds) = _split_timedelta(difference)

                if last_status_dict["STATUS"] == "stopped":
                    time_stopped = difference
                    if time_stopped.days > 7:
                        # Stopped for more than a week: forget about it
                        print("removing: %s as we havent seen him for %s days."
                              % (mon_file_name, abs(time_stopped).days))
                        try:
                            os.remove(mon_file_name)
                        except OSError as err:
                            print("could not remove: '%s' Error: %s"
                                  % (mon_file_name, str(err)))
                        continue
def create_monitor(vgrid_name): """Write monitor HTML file for vgrid_name""" html_file = os.path.join(configuration.vgrid_home, vgrid_name, '%s.html' % configuration.vgrid_monitor) print 'collecting statistics for VGrid %s' % vgrid_name sleep_secs = configuration.sleep_secs slackperiod = configuration.slackperiod now = time.asctime(time.localtime()) html_vars = { 'sleep_secs': sleep_secs, 'vgrid_name': vgrid_name, 'logo_url': '/images/logo.jpg', 'now': now, 'short_title': configuration.short_title, } monitor_meta = '''<meta http-equiv="refresh" content="%(sleep_secs)s" /> ''' % html_vars add_import = ''' <script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script> ''' add_init = '' add_ready = ''' // table initially sorted by col. 1 (name) var sortOrder = [[1,0]]; // use image path for sorting if there is any inside var imgTitle = function(contents) { var key = $(contents).find("a").attr("class"); if (key == null) { key = $(contents).html(); } return key; } $("table.monitor").tablesorter({widgets: ["zebra"], textExtraction: imgTitle, }); $("table.monitor").each(function () { try { $(this).trigger("sorton", [sortOrder]); } catch(err) { /* tablesorter chokes on empty tables - just continue */ } }); ''' monitor_js = ''' %s <script type="text/javascript" > %s $(document).ready(function() { %s } ); </script> ''' % (add_import, add_init, add_ready) # User default site style style_helpers = themed_styles(configuration) script_helpers = themed_scripts(configuration) script_helpers['advanced'] += add_import script_helpers['init'] += add_init script_helpers['ready'] += add_ready html = get_xgi_html_header( configuration, '%(short_title)s Monitor, VGrid %(vgrid_name)s' % html_vars, '', html=True, meta=monitor_meta, style_map=style_helpers, script_map=script_helpers, frame=False, menu=False, widgets=False, userstyle=False, ) html += \ ''' <!-- end of raw header: this line is used by showvgridmonitor --> <h1>Statistics/monitor for the %(vgrid_name)s 
VGrid</h1> <div class="generatornote smallcontent"> This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs). </div> '''\ % html_vars # loop and get totals parse_count = 0 queued_count = 0 frozen_count = 0 executing_count = 0 finished_count = 0 failed_count = 0 retry_count = 0 canceled_count = 0 cpucount_requested = 0 cpucount_done = 0 nodecount_requested = 0 nodecount_done = 0 cputime_requested = 0 cputime_done = 0 used_walltime = 0 disk_requested = 0 disk_done = 0 memory_requested = 0 memory_done = 0 runtimeenv_dict = {'': 0} runtimeenv_requested = 0 runtimeenv_done = 0 number_of_jobs = 0 up_count = 0 down_count = 0 slack_count = 0 job_assigned = 0 job_assigned_cpus = 0 gstat = GridStat(configuration, logger) runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RUNTIMEENVIRONMENT', {}) parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'PARSE') queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'QUEUED') frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FROZEN') executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'EXECUTING') failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FAILED') retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RETRY') canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CANCELED') expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'EXPIRED') finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FINISHED') nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'NODECOUNT_REQ') nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'NODECOUNT_DONE') cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUTIME_REQ') cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUTIME_DONE') used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'USED_WALLTIME') if (used_walltime == 0): used_walltime = datetime.timedelta(0) 
used_walltime = format_timedelta(used_walltime) disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'DISK_REQ') disk_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'DISK_DONE') memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'MEMORY_REQ') memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'MEMORY_DONE') cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUCOUNT_REQ') cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUCOUNT_DONE') runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RUNTIMEENVIRONMENT_REQ') runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RUNTIMEENVIRONMENT_DONE') number_of_jobs = parse_count number_of_jobs += queued_count number_of_jobs += frozen_count number_of_jobs += expired_count number_of_jobs += canceled_count number_of_jobs += failed_count number_of_jobs += executing_count number_of_jobs += finished_count number_of_jobs += retry_count html_vars = { 'parse_count': parse_count, 'queued_count': queued_count, 'frozen_count': frozen_count, 'executing_count': executing_count, 'failed_count': failed_count, 'retry_count': retry_count, 'canceled_count': canceled_count, 'expired_count': expired_count, 'finished_count': finished_count, 'number_of_jobs': number_of_jobs, 'cpucount_requested': cpucount_requested, 'cpucount_done': cpucount_done, 'nodecount_requested': nodecount_requested, 'nodecount_done': nodecount_done, 'cputime_requested': cputime_requested, 'cputime_done': cputime_done, 'used_walltime': used_walltime, 'disk_requested': disk_requested, 'disk_done': disk_done, 'memory_requested': memory_requested, 'memory_done': memory_done, 'runtimeenv_requested': runtimeenv_requested, 'runtimeenv_done': runtimeenv_done, } html += \ """<h2>Job Stats</h2><table class=monitorstats><tr><td> <table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr> <tr><td>Parse</td><td>%(parse_count)s</td></tr> 
<tr><td>Queued</td><td>%(queued_count)s</td></tr> <tr><td>Frozen</td><td>%(frozen_count)s</td></tr> <tr><td>Executing</td><td>%(executing_count)s</td></tr> <tr><td>Failed</td><td>%(failed_count)s</td></tr> <tr><td>Retry</td><td>%(retry_count)s</td></tr> <tr><td>Canceled</td><td>%(canceled_count)s</td></tr> <tr><td>Expired</td><td>%(expired_count)s</td></tr> <tr><td>Finished</td><td>%(finished_count)s</td></tr> <tr><td>Total</td><td>%(number_of_jobs)s</td></tr> </table> </td><td> <table class=monitorresreq> <tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr> <tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr> <tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr> <tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr> <tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr> <tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr> <tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr> <tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr> </table><br /> </td><td> <div class=monitorruntimeenvdetails> <table class=monitorruntimeenvdone> <tr class=title><td>Runtime Envs Done</td><td></td></tr> """\ % html_vars if len(runtimeenv_dict.keys()) < 1: # No runtimeenv requests html += '<tr><td></td><td>-</td></tr>\n' else: for entry in runtimeenv_dict.keys(): if not entry == '': html += '<tr><td>' + entry + '</td><td>'\ + str(runtimeenv_dict[entry]) + '</td></tr>\n' total_number_of_exe_resources, total_number_of_store_resources = 0, 0 total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0 vgrid_name_list = vgrid_name.split('/') current_dir = '' exes, stores = '', '' for vgrid_name_part in vgrid_name_list: current_dir = os.path.join(current_dir, vgrid_name_part) abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir) # print 'dir: %s' % abs_mon_dir # Potential 
race - just ignore if it disappeared try: sorted_names = os.listdir(abs_mon_dir) except OSError: continue sorted_names.sort() for filename in sorted_names: # print filename if filename.startswith('monitor_last_request_'): # read last request helper file mon_file_name = os.path.join(abs_mon_dir, filename) print 'found ' + mon_file_name last_request_dict = unpickle(mon_file_name, logger) if not last_request_dict: print 'could not open and unpickle: '\ + mon_file_name continue if not last_request_dict.has_key('CREATED_TIME'): print 'skip broken last request dict: '\ + mon_file_name continue difference =\ - last_request_dict['CREATED_TIME'] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) last_timetuple = last_request_dict['CREATED_TIME'].timetuple() if last_request_dict.has_key('CPUTIME'): cputime = last_request_dict['CPUTIME'] elif last_request_dict.has_key('cputime'): cputime = last_request_dict['cputime'] else: print 'ERROR: last request does not contain cputime field!: %s'\ % last_request_dict continue try: cpusec = int(cputime) except ValueError: try: cpusec = int(float(cputime)) except ValueError, verr: print 'ERROR: failed to parse cputime %s: %s'\ % (cputime, verr) # Include execution delay guesstimate for strict fill # LRMS resources try: delay = int(last_request_dict['EXECUTION_DELAY']) except KeyError: delay = 0 except ValueError: delay = 0 time_remaining = (last_request_dict['CREATED_TIME'] + datetime.timedelta(seconds=cpusec) + datetime.timedelta(seconds=delay))\ - days_rem = str(time_remaining.days) hours_rem = str(time_remaining.seconds / 3600) minutes_rem = str((time_remaining.seconds % 3600) / 60) seconds_rem = str((time_remaining.seconds % 60) % 60) if time_remaining.days < -7: try: print 'removing: %s as we havent seen him for %s days.'\ % (mon_file_name, abs(time_remaining).days) os.remove(mon_file_name) except Exception, err: print 
"could not remove: '%s' Error: %s"\ % (mon_file_name, str(err)) pass else: unique_res_name_and_exe_list = \ filename.split('monitor_last_request_', 1) if cpusec == 0: resource_status = 'unavailable' elif time_remaining.days < 0: # time_remaining.days < 0 means that we have passed the specified time time_rem_abs = abs(time_remaining) if time_rem_abs.days == 0\ and int(time_rem_abs.seconds)\ < int(slackperiod): resource_status = 'slack' slack_count = slack_count + 1 else: resource_status = 'offline' down_count = down_count + 1 else: resource_status = 'online' up_count = up_count + 1 exes += '<tr>' exes += \ '<td><img src=/images/status-icons/%s.png /></td>'\ % resource_status public_id = unique_res_name_and_exe_list[1] if last_request_dict['RESOURCE_CONFIG'].get( 'ANONYMOUS', True): public_id = anon_resource_id(public_id) public_name = last_request_dict['RESOURCE_CONFIG'].get( 'PUBLICNAME', '') resource_parts = public_id.split('_', 2) resource_name = "<a href=''>%s</a>" % \ (resource_parts[0], resource_parts[0]) if public_name: resource_name += "<br />(alias %s)" % public_name else: resource_name += "<br />(no alias)" resource_name += "<br />%s" % resource_parts[1] exes += '<td>%s</td>' % resource_name last_asctime = time.asctime(last_timetuple) last_epoch = time.mktime(last_timetuple) exes += '<td><div class="sortkey">%s</div>%s<br />' % \ (last_epoch, last_asctime) exes += '(%sd %sh %sm %ss ago)</td>' % (days, hours, minutes, seconds) exes += '<td>' + vgrid_name + '</td>' runtime_envs = last_request_dict['RESOURCE_CONFIG'][ 'RUNTIMEENVIRONMENT'] runtime_envs.sort() re_list_text = ', '.join([i[0] for i in runtime_envs]) exes += '<td title="%s">' % re_list_text \ + str(len(runtime_envs)) + '</td>' exes += '<td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['CPUTIME']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['NODECOUNT']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['CPUCOUNT']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' 
]['DISK']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['MEMORY']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['ARCHITECTURE']) + '</td>' exes += '<td>' + last_request_dict['STATUS']\ + '</td><td>' + str(last_request_dict['CPUTIME' ]) + '</td>' exes += '<td class=status_%s>' % resource_status if 'unavailable' == resource_status: exes += '-' elif 'slack' == resource_status: exes += 'Within slack period (%s < %s secs)'\ % (time_rem_abs.seconds, slackperiod) elif 'offline' == resource_status: exes += 'down?' else: exes += '%sd, %sh, %sm, %ss'\ % (days_rem, hours_rem, minutes_rem, seconds_rem) exes += '</td>' exes += '</tr>\n' if last_request_dict['STATUS'] == 'Job assigned': job_assigned = job_assigned + 1 job_assigned_cpus = job_assigned_cpus\ + int(last_request_dict['RESOURCE_CONFIG' ]['NODECOUNT'])\ * int(last_request_dict['RESOURCE_CONFIG' ]['CPUCOUNT']) total_number_of_exe_resources += 1 total_number_of_exe_cpus += int( last_request_dict['RESOURCE_CONFIG']['NODECOUNT']) \ * int(last_request_dict['RESOURCE_CONFIG']['CPUCOUNT']) elif filename.startswith('monitor_last_status_'): # store must be linked to this vgrid, not only parent vgrid: # inheritance only covers access, not automatic participation if current_dir != vgrid_name: continue # read last resource action status file mon_file_name = os.path.join(abs_mon_dir, filename) print 'found ' + mon_file_name last_status_dict = unpickle(mon_file_name, logger) if not last_status_dict: print 'could not open and unpickle: '\ + mon_file_name continue if not last_status_dict.has_key('CREATED_TIME'): print 'skip broken last request dict: '\ + mon_file_name continue difference =\ - last_status_dict['CREATED_TIME'] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) if last_status_dict['STATUS'] == 'stopped': time_stopped = - \ last_status_dict['CREATED_TIME'] if time_stopped.days > 7: 
try: print 'removing: %s as we havent seen him for %s days.'\ % (mon_file_name, abs(time_stopped).days) os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s"\ % (mon_file_name, str(err)) continue