def load_twofactor_session(configuration, session_key):
    """Load given twofactor session"""
    _logger = configuration.logger
    session_path = os.path.join(configuration.twofactor_home, session_key)
    # The session file ctime plus the cookie TTL gives the default expiry
    try:
        session_expire = os.stat(session_path).st_ctime + twofactor_cookie_ttl
    except Exception as exc:
        _logger.warning("Could not stat session_path %s: %s" % (session_path,
                                                                exc))
        return {}
    # NOTE: prefer pickled sessions but fall back to legacy plain text format
    session_data = unpickle(session_path, _logger)
    if not session_data:
        raw_session = read_file(session_path, _logger)
        # Legacy files hold one value per line: agent, address, client
        session_data = {'user_agent': 'UNKNOWN',
                        'user_addr': 'UNKNOWN',
                        'client_id': 'UNKNOWN',
                        'session_end': session_expire,
                        'session_start': session_expire - twofactor_cookie_ttl}
        legacy_fields = ['user_agent', 'user_addr', 'client_id']
        for (field, line) in zip(legacy_fields, raw_session.split('\n')):
            session_data[field] = line.strip()
    else:
        # new pickle format carries an explicit session_end value
        session_expire = session_data.get('session_end', session_expire)
    return session_data
def recv_notification(configuration, path):
    """Read notification event from file"""
    logger = configuration.logger
    event = unpickle(path, logger)
    if not event:
        logger.error("Failed to unpickle: %s" % path)
        return False
    status = True
    client_id = None
    user_id = event.get('user_id', '')
    if not user_id:
        status = False
        logger.error("Missing user_id in notification: %s" % path)
    else:
        client_id = expand_openid_alias(user_id, configuration)
        if not client_id or not extract_field(client_id, 'email'):
            status = False
            logger.error("Failed to resolve client_id from user_id: '%s'"
                         % user_id)
    if status:
        category = event.get('category', [])
        if not isinstance(category, list):
            status = False
            logger.error("Received category: %s must be a list" % category)
    if status:
        logger.info("Received event: %s, from: '%s'" % (category, client_id))
        # NOTE(review): a missing 'timestamp' key yields None here and would
        # break the min() below in Python 3 - confirm senders always set it
        new_timestamp = event.get('timestamp')
        message = event.get('message', '')
        # Accumulate per-client state in the module-level registry
        client_entry = received_notifications.get(client_id, {})
        if not client_entry:
            received_notifications[client_id] = client_entry
        seen_files = client_entry.get('files', [])
        if not seen_files:
            client_entry['files'] = seen_files
        if path in seen_files:
            logger.warning("Skipping previously received notification: '%s'"
                           % path)
        else:
            seen_files.append(path)
        # Keep the oldest timestamp seen for this client
        client_entry['timestamp'] = min(client_entry.get('timestamp',
                                                         sys.maxsize),
                                        new_timestamp)
        messages = client_entry.get('messages', {})
        if not messages:
            client_entry['messages'] = messages
        header = " ".join(category) or '* UNKNOWN *'
        counts = messages.get(header, {})
        if not counts:
            messages[header] = counts
        # Count duplicate message bodies instead of storing them repeatedly
        counts[message] = counts.get(message, 0) + 1
    return status
def fill_triggers(configuration, vgrids_dict):
    """Search for system_imagesettings triggers and the needed information,
    such as rule_id, run_as and path to *vgrids_dict*"""
    logger = configuration.logger
    status = True
    logger.info(str(vgrids_dict.keys()))
    for key in vgrids_dict:
        logger.info('----------------------------------------------')
        logger.info('%s' % key)
        logger.info('----------------------------------------------')
        entry = vgrids_dict[key]
        vgrid = entry['vgrid']
        vgridpath = entry['vgridpath']
        trigger_file = os.path.join(configuration.vgrid_home, vgrid,
                                    configuration.vgrid_triggers)
        if not os.path.exists(trigger_file):
            logger.warning("Missing trigger configuration: '%s'"
                           % trigger_file)
            continue
        triggers = unpickle(trigger_file, logger)
        if not isinstance(triggers, list):
            # Abort the whole scan on a corrupt trigger file
            status = False
            logger.error("Couldn't load trigger configuration: '%s'"
                         % trigger_file)
            break
        for rule in triggers:
            rule_id = rule['rule_id']
            if not rule_id.startswith('system_imagesettings_'):
                continue
            vgridtriggerpath = get_vgridtriggerpath(vgrid, rule['path'])
            if rule_id in ('system_imagesettings_meta_created',
                           'system_imagesettings_dir_deleted') \
                    or vgridtriggerpath == vgridpath:
                logger.info("vgrid: '%s'" % vgrid)
                logger.info("path: '%s'" % vgridpath)
                logger.info("rule_id: '%s'" % rule_id)
                logger.info("run_as '%s'" % rule['run_as'])
                logger.info('----------------------------------------------')
                entry['triggers'].append({'rule_id': rule_id,
                                          'run_as': rule['run_as'],
                                          'path': vgridpath})
    return status
def main():
    """List the vgrid names found in the pickled trigger dictionary file."""
    configuration = get_configuration_object()
    # Overwrite default logger
    # FIX: this assignment was accidentally duplicated on two consecutive
    # lines; a single assignment is sufficient
    logger = configuration.logger = get_logger(logging.INFO)
    vgrids_dict = unpickle(TRIGGER_DICT_FILE, logger)
    vgrid_list = get_vgrids_dict(vgrids_dict)
    for name in vgrid_list:
        print(name)
def get_resource_configuration(resource_home, unique_resource_name, logger):
    """Load a resource configuration from file.

    Reads the pickled 'config' file under the resource's directory and
    returns a (status, value) pair: (True, config_dict) on success or
    (False, error_message) if the file could not be unpickled.
    """
    # IMPROVED: build the path with os.path.join instead of manual '/'
    # concatenation for portability and to avoid doubled separators
    resource_config_file = os.path.join(resource_home, unique_resource_name,
                                        'config')
    resource_config = unpickle(resource_config_file, logger)
    if not resource_config:
        msg = 'could not unpickle %s' % resource_config_file
        logger.error(msg)
        return (False, msg)
    return (True, resource_config)
def main():
    """Run the trigger migration steps in order, stopping on first failure.

    Returns 0 on overall success and 1 otherwise.
    """
    ok = True
    configuration = get_configuration_object()
    logger = configuration.logger = get_logger(logging.INFO)

    # Overwrite default logger

    # An optional single CLI argument restricts the run to a comma separated
    # list of vgrids
    cli_args = len(sys.argv) - 1
    user_vgrid_list = None
    if cli_args == 1:
        user_vgrid_list = [entry.strip() for entry in sys.argv[1].split(',')]
        logger.info('Using custom vgrid_list: %s' % user_vgrid_list)

    vgrids_dict = unpickle(TRIGGER_DICT_FILE, logger)
    update_trigger_dict = None
    if not vgrids_dict:
        ok = False
        logger.error("Missing vgrid dict file: '%s'" % TRIGGER_DICT_FILE)
    else:
        (vgrids_dict, vgrid_list) = filter_vgrids_dict(configuration,
                                                       vgrids_dict,
                                                       user_vgrid_list)

    # Each step only runs if all previous steps succeeded
    if ok:
        ok = backup_trigger_files(configuration, vgrid_list)
    if ok:
        ok = backup_imagesettings_files(configuration, vgrid_list)
    if ok:
        ok = backup_paraview_links(configuration, vgrid_list)
    if ok:
        update_trigger_dict = \
            get_update_trigger_dict_and_check_for_unique_clientid(
                configuration, vgrids_dict)
        if update_trigger_dict is None:
            ok = False
    if ok:
        ok = remove_triggers(configuration, vgrids_dict)
    if ok:
        ok = update_backend(configuration, update_trigger_dict)

    return 0 if ok else 1
def create_monitor(vgrid_name):
    """Write monitor HTML file for vgrid_name.

    Collects job statistics via GridStat plus per-resource state from the
    monitor_last_request_* / monitor_last_status_* helper files under the
    vgrid home, and renders a self-refreshing HTML status page to
    <vgrid_home>/<vgrid_name>/<vgrid_monitor>.html.
    Relies on module-level configuration and logger objects.
    """

    html_file = os.path.join(configuration.vgrid_home, vgrid_name,
                             '%s.html' % configuration.vgrid_monitor)

    print('collecting statistics for VGrid %s' % vgrid_name)
    sleep_secs = configuration.sleep_secs
    slackperiod = configuration.slackperiod
    now = time.asctime(time.localtime())

    html_vars = {
        'sleep_secs': sleep_secs,
        'vgrid_name': vgrid_name,
        'logo_url': '/images/logo.jpg',
        'now': now,
        'short_title': configuration.short_title,
    }

    # Auto-refresh the rendered page at the collection interval
    monitor_meta = '''<meta http-equiv="refresh" content="%(sleep_secs)s" />
''' % html_vars
    add_import = '''
<script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script>
'''
    add_init = ''
    add_ready = '''
          // table initially sorted by col. 1 (name)
          var sortOrder = [[1,0]];

          // use image path for sorting if there is any inside
          var imgTitle = function(contents) {
              var key = $(contents).find("a").attr("class");
              if (key == null) {
                  key = $(contents).html();
              }
              return key;
          }
          $("table.monitor").tablesorter({widgets: ["zebra"],
                                          textExtraction: imgTitle,
                                         });
          $("table.monitor").each(function () {
              try {
                  $(this).trigger("sorton", [sortOrder]);
              } catch(err) {
                  /* tablesorter chokes on empty tables - just continue */
              }
          });
'''
    # NOTE(review): monitor_js is assigned but never used below - the same
    # snippets are injected through script_helpers instead; confirm whether
    # this variable is dead code
    monitor_js = '''
%s
<script type="text/javascript" >
%s
$(document).ready(function() {
%s
}
);
</script>
''' % (add_import, add_init, add_ready)

    # User default site style
    style_helpers = themed_styles(configuration)
    script_helpers = themed_scripts(configuration)
    script_helpers['advanced'] += add_import
    script_helpers['init'] += add_init
    script_helpers['ready'] += add_ready
    html = get_xgi_html_header(
        configuration,
        '%(short_title)s Monitor, VGrid %(vgrid_name)s' % html_vars,
        '',
        html=True,
        meta=monitor_meta,
        style_map=style_helpers,
        script_map=script_helpers,
        frame=False,
        menu=False,
        widgets=False,
        userstyle=False,
    )
    html += \
        '''
<!-- end of raw header: this line is used by showvgridmonitor -->
<h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1>
<div class="generatornote smallcontent">
This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs).
</div>
''' % html_vars

    # loop and get totals
    parse_count = 0
    queued_count = 0
    frozen_count = 0
    executing_count = 0
    finished_count = 0
    failed_count = 0
    retry_count = 0
    canceled_count = 0
    cpucount_requested = 0
    cpucount_done = 0
    nodecount_requested = 0
    nodecount_done = 0
    cputime_requested = 0
    cputime_done = 0
    used_walltime = 0
    disk_requested = 0
    disk_done = 0
    memory_requested = 0
    memory_done = 0
    runtimeenv_dict = {'': 0}
    runtimeenv_requested = 0
    runtimeenv_done = 0
    number_of_jobs = 0
    up_count = 0
    down_count = 0
    slack_count = 0
    job_assigned = 0
    job_assigned_cpus = 0

    # GridStat keys vgrids by upper-case name
    gstat = GridStat(configuration, logger)
    runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                      'RUNTIMEENVIRONMENT', {})
    parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'PARSE')
    queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'QUEUED')
    frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FROZEN')
    executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                      'EXECUTING')
    failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FAILED')
    retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RETRY')
    canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'CANCELED')
    expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                    'EXPIRED')
    finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'FINISHED')
    nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                          'NODECOUNT_REQ')
    nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'NODECOUNT_DONE')
    cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                        'CPUTIME_REQ')
    cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                   'CPUTIME_DONE')
    used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                    'USED_WALLTIME')
    # A zero means no recorded walltime yet; normalize to a timedelta so
    # format_timedelta below always gets the expected type
    if (used_walltime == 0):
        used_walltime = datetime.timedelta(0)
    used_walltime = format_timedelta(used_walltime)
    disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                     'DISK_REQ')
    disk_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'DISK_DONE')
    memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                       'MEMORY_REQ')
    memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                  'MEMORY_DONE')
    cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                         'CPUCOUNT_REQ')
    cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                    'CPUCOUNT_DONE')
    runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                           'RUNTIMEENVIRONMENT_REQ')
    runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(),
                                      'RUNTIMEENVIRONMENT_DONE')

    # Sum all job states for the grand total
    number_of_jobs = parse_count
    number_of_jobs += queued_count
    number_of_jobs += frozen_count
    number_of_jobs += expired_count
    number_of_jobs += canceled_count
    number_of_jobs += failed_count
    number_of_jobs += executing_count
    number_of_jobs += finished_count
    number_of_jobs += retry_count

    html_vars = {
        'parse_count': parse_count,
        'queued_count': queued_count,
        'frozen_count': frozen_count,
        'executing_count': executing_count,
        'failed_count': failed_count,
        'retry_count': retry_count,
        'canceled_count': canceled_count,
        'expired_count': expired_count,
        'finished_count': finished_count,
        'number_of_jobs': number_of_jobs,
        'cpucount_requested': cpucount_requested,
        'cpucount_done': cpucount_done,
        'nodecount_requested': nodecount_requested,
        'nodecount_done': nodecount_done,
        'cputime_requested': cputime_requested,
        'cputime_done': cputime_done,
        'used_walltime': used_walltime,
        'disk_requested': disk_requested,
        'disk_done': disk_done,
        'memory_requested': memory_requested,
        'memory_done': memory_done,
        'runtimeenv_requested': runtimeenv_requested,
        'runtimeenv_done': runtimeenv_done,
    }

    html += \
        """<h2>Job Stats</h2><table class=monitorstats><tr><td>
<table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr>
<tr><td>Parse</td><td>%(parse_count)s</td></tr>
<tr><td>Queued</td><td>%(queued_count)s</td></tr>
<tr><td>Frozen</td><td>%(frozen_count)s</td></tr>
<tr><td>Executing</td><td>%(executing_count)s</td></tr>
<tr><td>Failed</td><td>%(failed_count)s</td></tr>
<tr><td>Retry</td><td>%(retry_count)s</td></tr>
<tr><td>Canceled</td><td>%(canceled_count)s</td></tr>
<tr><td>Expired</td><td>%(expired_count)s</td></tr>
<tr><td>Finished</td><td>%(finished_count)s</td></tr>
<tr><td>Total</td><td>%(number_of_jobs)s</td></tr>
</table>
</td><td>
<table class=monitorresreq>
<tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr>
<tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr>
<tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr>
<tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr>
<tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr>
<tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr>
<tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr>
<tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr>
</table><br />
</td><td>
<div class=monitorruntimeenvdetails>
<table class=monitorruntimeenvdone>
<tr class=title><td>Runtime Envs Done</td><td></td></tr>
""" \
        % html_vars
    if len(runtimeenv_dict.keys()) < 1:

        # No runtimeenv requests

        html += '<tr><td></td><td>-</td></tr>\n'
    else:
        for entry in runtimeenv_dict.keys():
            if not entry == '':
                html += '<tr><td>' + entry + '</td><td>'\
                    + str(runtimeenv_dict[entry]) + '</td></tr>\n'

    total_number_of_exe_resources, total_number_of_store_resources = 0, 0
    total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0

    vgrid_name_list = vgrid_name.split('/')
    current_dir = ''

    exes, stores = '', ''
    # Walk each path component so exe resources from parent vgrids are
    # included too
    for vgrid_name_part in vgrid_name_list:
        current_dir = os.path.join(current_dir, vgrid_name_part)
        abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir)
        # print 'dir: %s' % abs_mon_dir
        # Potential race - just ignore if it disappeared
        try:
            sorted_names = os.listdir(abs_mon_dir)
        except OSError:
            continue
        sorted_names.sort()
        for filename in sorted_names:
            # print filename
            if filename.startswith('monitor_last_request_'):

                # read last request helper file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print('found ' + mon_file_name)
                last_request_dict = unpickle(mon_file_name, logger)
                if not last_request_dict:
                    print('could not open and unpickle: ' + mon_file_name)
                    continue

                if 'CREATED_TIME' not in last_request_dict:
                    print('skip broken last request dict: ' + mon_file_name)
                    continue

                difference = datetime.datetime.now()\
                    - last_request_dict['CREATED_TIME']
                days = str(difference.days)
                # NOTE(review): '/' is true division in Python 3, so these
                # yield float strings like '1.5' - likely meant '//'; confirm
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                # NOTE(review): the second '% 60' is redundant
                seconds = str((difference.seconds % 60) % 60)

                last_timetuple = last_request_dict['CREATED_TIME'].timetuple()

                if 'CPUTIME' in last_request_dict:
                    cputime = last_request_dict['CPUTIME']
                elif 'cputime' in last_request_dict:
                    cputime = last_request_dict['cputime']
                else:
                    print(
                        'ERROR: last request does not contain cputime field!: %s'
                        % last_request_dict)
                    continue

                try:
                    cpusec = int(cputime)
                except ValueError:
                    try:
                        cpusec = int(float(cputime))
                    except ValueError as verr:
                        # NOTE(review): no continue here, so cpusec keeps a
                        # stale value (or is unset on first iteration) when
                        # both parse attempts fail - confirm intent
                        print('ERROR: failed to parse cputime %s: %s'
                              % (cputime, verr))

                # Include execution delay guesstimate for strict fill
                # LRMS resources

                try:
                    delay = int(last_request_dict['EXECUTION_DELAY'])
                except KeyError:
                    delay = 0
                except ValueError:
                    delay = 0

                time_remaining = (last_request_dict['CREATED_TIME']
                                  + datetime.timedelta(seconds=cpusec)
                                  + datetime.timedelta(seconds=delay))\
                    - datetime.datetime.now()
                days_rem = str(time_remaining.days)
                # NOTE(review): same true-division concern as above
                hours_rem = str(time_remaining.seconds / 3600)
                minutes_rem = str((time_remaining.seconds % 3600) / 60)
                seconds_rem = str((time_remaining.seconds % 60) % 60)

                if time_remaining.days < -7:

                    # Prune resources not seen for more than a week

                    try:
                        print(
                            'removing: %s as we havent seen him for %s days.'
                            % (mon_file_name, abs(time_remaining).days))
                        os.remove(mon_file_name)
                    except Exception as err:
                        print("could not remove: '%s' Error: %s"
                              % (mon_file_name, str(err)))
                        pass
                else:
                    unique_res_name_and_exe_list = \
                        filename.split('monitor_last_request_', 1)
                    if cpusec == 0:
                        resource_status = 'unavailable'
                    elif time_remaining.days < 0:

                        # time_remaining.days < 0 means that we have passed the specified time

                        time_rem_abs = abs(time_remaining)
                        if time_rem_abs.days == 0\
                                and int(time_rem_abs.seconds)\
                                < int(slackperiod):
                            resource_status = 'slack'
                            slack_count = slack_count + 1
                        else:
                            resource_status = 'offline'
                            down_count = down_count + 1
                    else:
                        resource_status = 'online'
                        up_count = up_count + 1

                    exes += '<tr>'
                    exes += \
                        '<td><img src=/images/status-icons/%s.png /></td>'\
                        % resource_status
                    public_id = unique_res_name_and_exe_list[1]
                    # Hide the real resource id unless explicitly public
                    if last_request_dict['RESOURCE_CONFIG'].get(
                            'ANONYMOUS', True):
                        public_id = anon_resource_id(public_id)
                    public_name = last_request_dict['RESOURCE_CONFIG'].get(
                        'PUBLICNAME', '')
                    resource_parts = public_id.split('_', 2)
                    resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % \
                                    (resource_parts[0], resource_parts[0])
                    if public_name:
                        resource_name += "<br />(alias %s)" % public_name
                    else:
                        resource_name += "<br />(no alias)"
                    resource_name += "<br />%s" % resource_parts[1]
                    exes += '<td>%s</td>' % resource_name

                    last_asctime = time.asctime(last_timetuple)
                    last_epoch = time.mktime(last_timetuple)
                    # Hidden sortkey div makes tablesorter order by epoch
                    exes += '<td><div class="sortkey">%s</div>%s<br />' % \
                            (last_epoch, last_asctime)
                    exes += '(%sd %sh %sm %ss ago)</td>' % (days, hours,
                                                            minutes, seconds)
                    exes += '<td>' + vgrid_name + '</td>'
                    runtime_envs = last_request_dict['RESOURCE_CONFIG'][
                        'RUNTIMEENVIRONMENT']
                    runtime_envs.sort()
                    re_list_text = ', '.join([i[0] for i in runtime_envs])
                    exes += '<td title="%s">' % re_list_text \
                        + str(len(runtime_envs)) + '</td>'
                    exes += '<td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                              ]['CPUTIME']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                              ]['NODECOUNT']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                              ]['CPUCOUNT']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                              ]['DISK']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                              ]['MEMORY']) + '</td><td>'\
                        + str(last_request_dict['RESOURCE_CONFIG'
                              ]['ARCHITECTURE']) + '</td>'
                    exes += '<td>' + last_request_dict['STATUS']\
                        + '</td><td>' + str(last_request_dict['CPUTIME'
                                            ]) + '</td>'

                    exes += '<td class=status_%s>' % resource_status
                    if 'unavailable' == resource_status:
                        exes += '-'
                    elif 'slack' == resource_status:
                        exes += 'Within slack period (%s < %s secs)'\
                            % (time_rem_abs.seconds, slackperiod)
                    elif 'offline' == resource_status:
                        exes += 'down?'
                    else:
                        exes += '%sd, %sh, %sm, %ss'\
                            % (days_rem, hours_rem, minutes_rem, seconds_rem)
                    exes += '</td>'

                    exes += '</tr>\n'
                    if last_request_dict['STATUS'] == 'Job assigned':
                        job_assigned = job_assigned + 1
                        job_assigned_cpus = job_assigned_cpus\
                            + int(last_request_dict['RESOURCE_CONFIG'
                                  ]['NODECOUNT'])\
                            * int(last_request_dict['RESOURCE_CONFIG'
                                  ]['CPUCOUNT'])

                    total_number_of_exe_resources += 1
                    total_number_of_exe_cpus += int(
                        last_request_dict['RESOURCE_CONFIG']['NODECOUNT']) \
                        * int(last_request_dict['RESOURCE_CONFIG']['CPUCOUNT'])
            elif filename.startswith('monitor_last_status_'):

                # store must be linked to this vgrid, not only parent vgrid:
                # inheritance only covers access, not automatic participation

                if current_dir != vgrid_name:
                    continue

                # read last resource action status file

                mon_file_name = os.path.join(abs_mon_dir, filename)
                print('found ' + mon_file_name)
                last_status_dict = unpickle(mon_file_name, logger)
                if not last_status_dict:
                    print('could not open and unpickle: ' + mon_file_name)
                    continue

                if 'CREATED_TIME' not in last_status_dict:
                    print('skip broken last request dict: ' + mon_file_name)
                    continue

                difference = datetime.datetime.now()\
                    - last_status_dict['CREATED_TIME']
                days = str(difference.days)
                # NOTE(review): same Python 3 true-division concern as in the
                # exe branch above
                hours = str(difference.seconds / 3600)
                minutes = str((difference.seconds % 3600) / 60)
                seconds = str((difference.seconds % 60) % 60)

                if last_status_dict['STATUS'] == 'stopped':
                    time_stopped = datetime.datetime.now() - \
                        last_status_dict['CREATED_TIME']
                    if time_stopped.days > 7:
                        try:
                            print(
                                'removing: %s as we havent seen him for %s days.'
                                % (mon_file_name, abs(time_stopped).days))
                            os.remove(mon_file_name)
                        except Exception as err:
                            print("could not remove: '%s' Error: %s"
                                  % (mon_file_name, str(err)))
                        continue

                unique_res_name_and_store_list = filename.split(
                    'monitor_last_status_', 1)

                mount_point = last_status_dict.get('MOUNT_POINT', 'UNKNOWN')
                is_live = os.path.ismount(mount_point)
                public_id = unique_res_name_and_store_list[1]
                if last_status_dict['RESOURCE_CONFIG'].get('ANONYMOUS', True):
                    public_id = anon_resource_id(public_id)

                vgrid_link = os.path.join(configuration.vgrid_files_home,
                                          vgrid_name, public_id)
                is_linked = (os.path.realpath(vgrid_link) == mount_point)

                total_disk = last_status_dict['RESOURCE_CONFIG']['DISK']
                free_disk, avail_disk, used_disk, used_percent = 0, 0, 0, 0
                gig_bytes = 1.0 * 2**30

                # Fall back status - show last action unless statvfs succeeds

                last_status = last_status_dict['STATUS']
                last_timetuple = last_status_dict['CREATED_TIME'].timetuple()

                # These disk stats are slightly confusing but match 'df'
                # 'available' is the space that can actually be used so it
                # is typically less than 'free'.

                try:
                    disk_stats = os.statvfs(mount_point)
                    total_disk = disk_stats.f_bsize * disk_stats.f_blocks / \
                        gig_bytes
                    avail_disk = disk_stats.f_bsize * disk_stats.f_bavail / \
                        gig_bytes
                    free_disk = disk_stats.f_bsize * disk_stats.f_bfree / \
                        gig_bytes
                    used_disk = total_disk - free_disk
                    used_percent = 100.0 * used_disk / (avail_disk + used_disk)
                    last_status = 'checked'
                    last_timetuple = datetime.datetime.now().timetuple()
                    days, hours, minutes, seconds = 0, 0, 0, 0
                except OSError as ose:
                    print('could not stat mount point %s: %s'
                          % (mount_point, ose))
                    is_live = False

                if last_status_dict['STATUS'] == 'stopped':
                    resource_status = 'offline'
                    down_count = down_count + 1
                elif last_status_dict['STATUS'] == 'started':
                    if is_live and is_linked:
                        resource_status = 'online'
                        up_count = up_count + 1
                    else:
                        # started but mount missing or link broken
                        resource_status = 'slack'
                        down_count = down_count + 1
                else:
                    resource_status = 'unknown'

                stores += '<tr>'
                stores += \
                    '<td><img src=/images/status-icons/%s.png /></td>'\
                    % resource_status

                public_name = last_status_dict['RESOURCE_CONFIG'].get(
                    'PUBLICNAME', '')
                resource_parts = public_id.split('_', 2)
                resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % \
                                (resource_parts[0], resource_parts[0])
                if public_name:
                    resource_name += "<br />(alias %s)" % public_name
                else:
                    resource_name += "<br />(no alias)"
                resource_name += "<br />%s" % resource_parts[1]
                stores += '<td>%s</td>' % resource_name

                last_asctime = time.asctime(last_timetuple)
                last_epoch = time.mktime(last_timetuple)
                stores += '<td><div class="sortkey">%s</div>%s %s<br />' % \
                          (last_epoch, last_status, last_asctime)
                stores += '(%sd %sh %sm %ss ago)</td>' % (days, hours,
                                                          minutes, seconds)
                stores += '<td>' + vgrid_name + '</td>'
                stores += '<td>%d</td>' % total_disk
                stores += '<td>%d</td>' % used_disk
                stores += '<td>%d</td>' % avail_disk
                stores += '<td>%d</td>' % used_percent
                stores += '<td class=status_%s>' % resource_status
                stores += resource_status + '</td>'
                stores += '</tr>\n'

                total_number_of_store_resources += 1
                total_number_of_store_gigs += total_disk

    # Close the runtimeenv table and emit the collected resource rows
    html += """</table>
</div>
</td></tr>
</table>
<h2>Resource Job Requests</h2>
Listing the last request from each resource<br />
<br />
<table class="monitor columnsort">
<thead class="title">
<tr>
  <th class="icon"><!-- Status icon --></th>
  <th>Resource ID, unit</th>
  <th>Last seen</th>
  <th>VGrid</th>
  <th>Runtime envs</th>
  <th>CPU time (s)</th>
  <th>Node count</th>
  <th>CPU count</th>
  <th>Disk (GB)</th>
  <th>Memory (MB)</th>
  <th>Arch</th>
  <th>Status</th>
  <th>Job (s)</th>
  <th>Remaining</th>
</tr>
</thead>
<tbody>
"""
    html += exes
    html += '</tbody>\n</table>\n'
    html += """
<h2>Resource Storage</h2>
Listing the last check for each resource<br />
<br />
<table class="monitor columnsort">
<thead class="title">
<tr>
  <th class="icon"><!-- Status icon --></th>
  <th>Resource ID, unit</th>
  <th>Last Status</th>
  <th>VGrid</th>
  <th>Total Disk (GB)</th>
  <th>Used Disk (GB)</th>
  <th>Available Disk (GB)</th>
  <th>Disk Use %</th>
  <th>Status</th>
</tr>
</thead>
<tbody>
"""
    html += stores
    html += '</tbody>\n</table>\n'
    html += '''
<h2>VGrid Totals</h2>
A total of <b>'''\
        + str(total_number_of_exe_resources) + '</b> exe resources ('\
        + str(total_number_of_exe_cpus) + " cpu's) and <b>"\
        + str(total_number_of_store_resources) + '</b> store resources ('\
        + str(int(total_number_of_store_gigs)) + " GB) joined this VGrid ("\
        + str(up_count) + ' up, ' + str(down_count) + ' down?, '\
        + str(slack_count) + ' slack)<br />'
    html += str(job_assigned) + ' exe resources (' + str(job_assigned_cpus)\
        + """ cpu's) appear to be executing a job<br />
<br />
"""
    html += \
        '<!-- begin raw footer: this line is used by showvgridmonitor -->'
    html += get_xgi_html_footer(configuration, '')

    # NOTE(review): consider a 'with open(...)' context manager here
    try:
        file_handle = open(html_file, 'w')
        file_handle.write(html)
        file_handle.close()
    except Exception as exc:
        print('Could not write monitor page %s: %s' % (html_file, exc))
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Resubmits the jobs matching the supplied job_id patterns for client_id
    and returns the usual (output_objects, returnvalue) pair.
    """
    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)

    patterns = accepted['job_id']

    if not safe_handler(configuration, 'post', op_name, client_id,
                        get_csrf_limit(configuration), accepted):
        output_objects.append({'object_type': 'error_text', 'text':
                               '''Only accepting
CSRF-filtered POST requests to prevent unintended updates'''})
        return (output_objects, returnvalues.CLIENT_ERROR)

    if not configuration.site_enable_jobs:
        output_objects.append({'object_type': 'error_text', 'text':
                               '''Job execution is not enabled on this system'''})
        return (output_objects, returnvalues.SYSTEM_ERROR)

    if not patterns:
        output_objects.append({'object_type': 'error_text',
                               'text': 'No job_id specified!'})
        return (output_objects, returnvalues.NO_SUCH_JOB_ID)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name

    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    # FIX: initialize status BEFORE the pattern loop (matching the sibling
    # job handlers) - it was previously reset to OK after the loop, which
    # silently discarded the CLIENT_ERROR recorded for non-matching patterns
    status = returnvalues.OK
    filelist = []
    keywords_dict = mrslkeywords.get_keywords_dict(configuration)
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs
        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages
        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):
                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.
                logger.warning('%s tried to %s restricted path %s ! (%s)'
                               % (client_id, op_name, abs_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment
            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match
        if not match:
            output_objects.append({'object_type': 'error_text', 'text':
                                   '%s: You do not have any matching job IDs!'
                                   % pattern})
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # resubmit is hard on the server
    if len(filelist) > 100:
        output_objects.append({'object_type': 'error_text', 'text':
                               'Too many matching jobs (%s)!'
                               % len(filelist)})
        return (output_objects, returnvalues.CLIENT_ERROR)

    resubmitobjs = []
    for filepath in filelist:
        # Extract job_id from filepath (replace doesn't modify filepath)
        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        resubmitobj = {'object_type': 'resubmitobj', 'job_id': job_id}

        mrsl_dict = unpickle(filepath, logger)
        if not mrsl_dict:
            resubmitobj['message'] = "No such job: %s (%s)" % (job_id,
                                                               mrsl_file)
            status = returnvalues.CLIENT_ERROR
            resubmitobjs.append(resubmitobj)
            continue

        resubmit_items = keywords_dict.keys()

        # loop selected keywords and create mRSL string
        resubmit_job_string = ''

        for dict_elem in resubmit_items:
            value = ''
            # Extract job value with fallback to default to support optional
            # fields
            job_value = mrsl_dict.get(dict_elem,
                                      keywords_dict[dict_elem]['Value'])
            if keywords_dict[dict_elem]['Type'].startswith(
                    'multiplekeyvalues'):
                for (elem_key, elem_val) in job_value:
                    if elem_key:
                        value += '%s=%s\n' % (str(elem_key).strip(),
                                              str(elem_val).strip())
            elif keywords_dict[dict_elem]['Type'].startswith('multiple'):
                for elem in job_value:
                    if elem:
                        value += '%s\n' % str(elem).rstrip()
            else:
                if str(job_value):
                    value += '%s\n' % str(job_value).rstrip()

            # Only insert keywords with an associated value
            if value:
                if value.rstrip() != '':
                    resubmit_job_string += '''::%s::
%s

''' % (dict_elem, value.rstrip())

        # save tempfile
        (filehandle, tempfilename) = \
            tempfile.mkstemp(dir=configuration.mig_system_files,
                             text=True)
        # NOTE(review): os.write requires bytes in Python 3 - confirm that
        # resubmit_job_string is suitably encoded for this runtime
        os.write(filehandle, resubmit_job_string)
        os.close(filehandle)

        # submit job the usual way
        (new_job_status, msg, new_job_id) = new_job(tempfilename,
                                                    client_id,
                                                    configuration, False,
                                                    True)
        if not new_job_status:
            resubmitobj['status'] = False
            resubmitobj['message'] = msg
            status = returnvalues.SYSTEM_ERROR
            resubmitobjs.append(resubmitobj)
            continue

        resubmitobj['status'] = True
        resubmitobj['new_job_id'] = new_job_id
        resubmitobjs.append(resubmitobj)

    output_objects.append({'object_type': 'resubmitobjs', 'resubmitobjs':
                           resubmitobjs})

    return (output_objects, status)
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Checks feasibility of the caller's own queued/retry/frozen jobs matching
    the supplied job_id patterns and returns one checkcondjob result object
    per matched job.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)
    patterns = accepted['job_id']

    if not safe_handler(configuration, 'post', op_name, client_id,
                        get_csrf_limit(configuration), accepted):
        output_objects.append({
            'object_type': 'error_text', 'text':
            '''Only accepting CSRF-filtered POST requests to prevent unintended updates'''
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    if not configuration.site_enable_jobs:
        output_objects.append({
            'object_type': 'error_text', 'text':
            '''Job execution is not enabled on this system'''
        })
        return (output_objects, returnvalues.SYSTEM_ERROR)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name
    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    status = returnvalues.OK
    filelist = []
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs
        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to
        # avoid leaking information about file system layout while
        # allowing consistent error messages
        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):
                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.
                logger.warning('%s tried to %s restricted path %s ! (%s)'
                               % (client_id, op_name, abs_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment
            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match
        if not match:
            output_objects.append({
                'object_type': 'error_text',
                'text': '%s: You do not have any matching job IDs!' % pattern
            })
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # job feasibility is hard on the server, limit
    if len(filelist) > 100:
        output_objects.append({
            'object_type': 'error_text',
            'text': 'Too many matching jobs (%s)!' % len(filelist)
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    checkcondjobs = []
    for filepath in filelist:

        # Extract job_id from filepath (replace doesn't modify filepath)
        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        checkcondjob = {'object_type': 'checkcondjob', 'job_id': job_id}
        # NOTE: renamed from 'dict' to avoid shadowing the builtin
        job_dict = unpickle(filepath, logger)
        if not job_dict:
            checkcondjob['message'] = \
                ('The file containing the information '
                 'for job id %s could not be opened! '
                 'You can only check feasibility of '
                 'your own jobs!') % job_id
            checkcondjobs.append(checkcondjob)
            status = returnvalues.CLIENT_ERROR
            continue

        # Is the job status pending?
        possible_check_states = ['QUEUED', 'RETRY', 'FROZEN']
        if job_dict['STATUS'] not in possible_check_states:
            checkcondjob['message'] = \
                'You can only check feasibility of jobs with status: %s.'\
                % ' or '.join(possible_check_states)
            checkcondjobs.append(checkcondjob)
            continue

        # Actually check feasibility
        feasible_res = job_feasibility(configuration, job_dict)
        checkcondjob.update(feasible_res)
        checkcondjobs.append(checkcondjob)

    output_objects.append({
        'object_type': 'checkcondjobs', 'checkcondjobs': checkcondjobs
    })
    return (output_objects, status)
'requestinteractivejob error! unique_resource_name was not specified in the query string. Looks like a mis-configured resource!' ) o.reply_and_exit(o.ERROR) if localjobname == '': o.out( 'requestinteractivejob error! localjobname was not specified in the query string. Looks like a mis-configured resource!' ) o.reply_and_exit(o.ERROR) # TODO: check that the person who submitted the job (where the session ID points) is also the one that submitted the # received jobid (to avoid a verified user specifies another users job id) mrslfile = configuration.sessid_to_mrsl_link_home + sessionid + '.mRSL' mrsldict = unpickle(mrslfile, logger) if not mrsldict: o.out('requestinteractivejob error! Could not open mrsl file') o.reply_and_exit(o.ERROR) job_submitter_client_id = mrsldict['USER_CERT'] o.out('job_submitter_client_id: %s' % job_submitter_client_id) mrsl_jobid = mrsldict['JOB_ID'] if not jobid == mrsl_jobid: o.out('requestinteractivejob error! Wrong job_id specified!') o.reply_and_exit(o.ERROR) # TODO: check the status of the specified job(id) and verify it has not previously been executed. # The status must be ? (What about RETRY?)
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Applies a job state change (cancel/freeze/thaw via valid_actions) to the
    caller's own jobs matching the supplied job_id patterns, repickles the
    job files with the new status and notifies grid_script about each change.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)
    patterns = accepted['job_id']
    action = accepted['action'][-1]

    if not safe_handler(configuration, 'post', op_name, client_id,
                        get_csrf_limit(configuration), accepted):
        output_objects.append({
            'object_type': 'error_text', 'text':
            '''Only accepting CSRF-filtered POST requests to prevent unintended updates'''
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    if not configuration.site_enable_jobs:
        output_objects.append({
            'object_type': 'error_text', 'text':
            '''Job execution is not enabled on this system'''
        })
        return (output_objects, returnvalues.SYSTEM_ERROR)

    # NOTE: membership test directly on the dict instead of .keys()
    if action not in valid_actions:
        output_objects.append({
            'object_type': 'error_text',
            'text': 'Invalid job action "%s" (only %s supported)'
            % (action, ', '.join(valid_actions.keys()))
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    new_state = valid_actions[action]

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name
    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    status = returnvalues.OK
    filelist = []
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs
        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages
        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):
                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.
                logger.error(
                    '%s tried to use %s %s outside own home! (pattern %s)'
                    % (client_id, op_name, abs_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment
            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match
        if not match:
            output_objects.append({
                'object_type': 'error_text',
                'text': '%s: You do not have any matching job IDs!' % pattern
            })
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # job state change is hard on the server, limit
    if len(filelist) > 500:
        output_objects.append({
            'object_type': 'error_text',
            'text': 'Too many matching jobs (%s)!' % len(filelist)
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    changedstatusjobs = []
    for filepath in filelist:

        # Extract job_id from filepath (replace doesn't modify filepath)
        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        changedstatusjob = {
            'object_type': 'changedstatusjob',
            'job_id': job_id
        }

        job_dict = unpickle(filepath, logger)
        if not job_dict:
            changedstatusjob['message'] = '''The file containing the
information for job id %s could not be opened! You can only %s your own
jobs!''' % (job_id, action)
            changedstatusjobs.append(changedstatusjob)
            status = returnvalues.CLIENT_ERROR
            continue

        changedstatusjob['oldstatus'] = job_dict['STATUS']

        # Is the job status compatible with action?
        possible_cancel_states = [
            'PARSE', 'QUEUED', 'RETRY', 'EXECUTING', 'FROZEN'
        ]
        if action == 'cancel' and \
                job_dict['STATUS'] not in possible_cancel_states:
            changedstatusjob['message'] = \
                'You can only cancel jobs with status: %s.'\
                % ' or '.join(possible_cancel_states)
            status = returnvalues.CLIENT_ERROR
            changedstatusjobs.append(changedstatusjob)
            continue
        possible_freeze_states = ['QUEUED', 'RETRY']
        if action == 'freeze' and \
                job_dict['STATUS'] not in possible_freeze_states:
            changedstatusjob['message'] = \
                'You can only freeze jobs with status: %s.'\
                % ' or '.join(possible_freeze_states)
            status = returnvalues.CLIENT_ERROR
            changedstatusjobs.append(changedstatusjob)
            continue
        possible_thaw_states = ['FROZEN']
        if action == 'thaw' and \
                job_dict['STATUS'] not in possible_thaw_states:
            changedstatusjob['message'] = \
                'You can only thaw jobs with status: %s.'\
                % ' or '.join(possible_thaw_states)
            status = returnvalues.CLIENT_ERROR
            changedstatusjobs.append(changedstatusjob)
            continue

        # job action is handled by changing the STATUS field, notifying the
        # job queue and making sure the server never submits jobs with status
        # FROZEN or CANCELED.

        # file is repickled to ensure newest information is used, job_dict
        # might be old if another script has modified the file.

        # NOTE(review): on failure here we still fall through to the queue
        # notification below - presumably intentional best-effort, confirm.
        if not unpickle_and_change_status(filepath, new_state, logger):
            output_objects.append({
                'object_type': 'error_text',
                'text': 'Job status could not be changed to %s!' % new_state
            })
            status = returnvalues.SYSTEM_ERROR

        # Avoid key error and make sure grid_script gets expected number of
        # arguments
        if 'UNIQUE_RESOURCE_NAME' not in job_dict:
            job_dict['UNIQUE_RESOURCE_NAME'] = \
                'UNIQUE_RESOURCE_NAME_NOT_FOUND'
        if 'EXE' not in job_dict:
            job_dict['EXE'] = 'EXE_NAME_NOT_FOUND'

        # notify queue
        if not send_message_to_grid_script(
                'JOBACTION ' + job_id + ' '
                + job_dict['STATUS'] + ' '
                + new_state + ' '
                + job_dict['UNIQUE_RESOURCE_NAME'] + ' '
                + job_dict['EXE'] + '\n', logger, configuration):
            output_objects.append({
                'object_type': 'error_text',
                'text': '''Error sending message to grid_script,
job may still be in the job queue.'''
            })
            status = returnvalues.SYSTEM_ERROR
            continue

        changedstatusjob['newstatus'] = new_state
        changedstatusjobs.append(changedstatusjob)

    output_objects.append({
        'object_type': 'changedstatusjobs',
        'changedstatusjobs': changedstatusjobs
    })
    return (output_objects, status)
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Renders the parsed mRSL contents of the caller's own jobs matching the
    supplied job_id patterns as file_output objects, one per matched file.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    status = returnvalues.OK
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)
    flags = accepted['flags']
    patterns = accepted['job_id']

    if not configuration.site_enable_jobs:
        output_objects.append({'object_type': 'error_text', 'text':
                               '''Job execution is not enabled on this system'''})
        return (output_objects, returnvalues.SYSTEM_ERROR)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name
    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    mrsl_keywords_dict = get_keywords_dict(configuration)

    if verbose(flags):
        for flag in flags:
            output_objects.append({'object_type': 'text', 'text':
                                   '%s using flag: %s' % (op_name, flag)})

    for pattern in patterns:

        # Add file extension
        pattern += '.mRSL'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages
        unfiltered_match = glob.glob(base_dir + pattern)
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):
                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.
                logger.warning('%s tried to %s restricted path %s ! (%s)'
                               % (client_id, op_name, abs_path, pattern))
                continue
            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match
        if not match:
            output_objects.append({'object_type': 'file_not_found',
                                   'name': pattern})
            status = returnvalues.FILE_NOT_FOUND

        for abs_path in match:
            output_lines = []
            relative_path = abs_path.replace(base_dir, '')
            try:
                mrsl_dict = unpickle(abs_path, logger)
                if not mrsl_dict:
                    raise Exception('could not load job mRSL')
                for (key, val) in mrsl_dict.items():
                    # Only show recognized, non-empty mRSL keywords
                    if key not in mrsl_keywords_dict:
                        continue
                    if not val:
                        continue
                    output_lines.append('::%s::\n' % key)
                    # Hoist the repeated type lookup for this keyword
                    keyword_type = mrsl_keywords_dict[key]['Type']
                    if keyword_type == 'multiplestrings':
                        for line in val:
                            output_lines.append('%s\n' % line)
                    elif keyword_type == 'multiplekeyvalues':
                        for (left, right) in val:
                            output_lines.append('%s=%s\n' % (left, right))
                    else:
                        output_lines.append('%s\n' % val)
                    output_lines.append('\n')
            except Exception as exc:
                output_objects.append({'object_type': 'error_text', 'text':
                                       "%s: '%s': %s" % (op_name,
                                                         relative_path,
                                                         exc)})
                logger.error("%s: failed on '%s': %s" % (op_name,
                                                         relative_path,
                                                         exc))
                status = returnvalues.SYSTEM_ERROR
                continue
            if verbose(flags):
                output_objects.append({'object_type': 'file_output',
                                       'path': relative_path,
                                       'lines': output_lines})
            else:
                output_objects.append({'object_type': 'file_output',
                                       'lines': output_lines})

    return (output_objects, status)
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Builds a job_list of status objects for the caller's own jobs matching
    the supplied job_id patterns (optionally extended via project_name
    lookups), with extra details and interactive action links depending on
    the verbose/interactive flags.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id, op_header=False)
    client_dir = client_id_dir(client_id)
    status = returnvalues.OK
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        logger.error("jobstatus input validation failed: %s" % accepted)
        return (accepted, returnvalues.CLIENT_ERROR)

    flags = ''.join(accepted['flags'])
    max_jobs = int(accepted['max_jobs'][-1])
    order = 'unsorted '
    # NOTE(review): 'sorted' here presumably is the project flag-parsing
    # helper, not the builtin - confirm against the file's imports
    if sorted(flags):
        order = 'sorted '
    patterns = accepted['job_id']
    project_names = accepted['project_name']

    # Expand any requested project names into their member job ids
    if len(project_names) > 0:
        for project_name in project_names:
            project_name_job_ids = \
                get_job_ids_with_specified_project_name(client_id,
                                                        project_name,
                                                        configuration.mrsl_files_dir,
                                                        logger)
            patterns.extend(project_name_job_ids)

    if not configuration.site_enable_jobs:
        output_objects.append({'object_type': 'error_text', 'text':
                               '''Job execution is not enabled on this system'''})
        return (output_objects, returnvalues.SYSTEM_ERROR)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name
    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    output_objects.append({'object_type': 'header', 'text':
                           '%s %s job status' % (configuration.short_title,
                                                 order)})

    if not patterns:
        output_objects.append(
            {'object_type': 'error_text', 'text': 'No job_id specified!'})
        return (output_objects, returnvalues.NO_SUCH_JOB_ID)

    if verbose(flags):
        for flag in flags:
            output_objects.append({'object_type': 'text', 'text':
                                   '%s using flag: %s' % (op_name, flag)})

    if not os.path.isdir(base_dir):
        output_objects.append(
            {'object_type': 'error_text', 'text':
             ('You have not been created as a user on the %s server! '
              'Please contact the %s team.') %
             (configuration.short_title, configuration.short_title)})
        return (output_objects, returnvalues.CLIENT_ERROR)

    filelist = []
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs
        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to
        # avoid leaking information about file system layout while
        # allowing consistent error messages
        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):
                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.
                logger.warning('%s tried to %s restricted path %s ! (%s)'
                               % (client_id, op_name, abs_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment
            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match....
        if not match:
            output_objects.append(
                {'object_type': 'error_text', 'text':
                 '%s: You do not have any matching job IDs!' % pattern})
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # NOTE(review): 'sort' presumably is a project helper sorting in place
    if sorted(flags):
        sort(filelist)
    if max_jobs > 0 and max_jobs < len(filelist):
        output_objects.append(
            {'object_type': 'text', 'text':
             'Only showing first %d of the %d matching jobs as requested'
             % (max_jobs, len(filelist))})
        filelist = filelist[:max_jobs]

    # Iterate through jobs and list details for each
    job_list = {'object_type': 'job_list', 'jobs': []}

    for filepath in filelist:

        # Extract job_id from filepath (replace doesn't modify filepath)
        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        job_dict = unpickle(filepath, logger)
        if not job_dict:
            status = returnvalues.CLIENT_ERROR
            output_objects.append(
                {'object_type': 'error_text', 'text':
                 'No such job: %s (could not load mRSL file %s)'
                 % (job_id, filepath)})
            continue

        # Expand any job variables before use
        job_dict = expand_variables(job_dict)

        job_obj = {'object_type': 'job', 'job_id': job_id}
        job_obj['status'] = job_dict['STATUS']

        time_fields = [
            'VERIFIED',
            'VERIFIED_TIMESTAMP',
            'RECEIVED_TIMESTAMP',
            'QUEUED_TIMESTAMP',
            'SCHEDULE_TIMESTAMP',
            'EXECUTING_TIMESTAMP',
            'FINISHED_TIMESTAMP',
            'FAILED_TIMESTAMP',
            'CANCELED_TIMESTAMP',
        ]
        for name in time_fields:
            if name in job_dict:

                # time objects cannot be marshalled, asctime if timestamp
                try:
                    job_obj[name.lower()] = time.asctime(job_dict[name])
                except Exception as exc:

                    # not a time object, just add
                    job_obj[name.lower()] = job_dict[name]

        ###########################################
        # ARC job status retrieval on demand:
        # But we should _not_ update the status in the mRSL files, since
        # other MiG code might rely on finding only valid "MiG" states.
        if configuration.arc_clusters and \
                job_dict.get('UNIQUE_RESOURCE_NAME', 'unset') == 'ARC' \
                and job_dict['STATUS'] == 'EXECUTING':
            try:
                home = os.path.join(configuration.user_home, client_dir)
                arcsession = arcwrapper.Ui(home)
                arcstatus = arcsession.jobStatus(job_dict['EXE'])
                job_obj['status'] = arcstatus['status']
            except arcwrapper.ARCWrapperError as err:
                logger.error('Error retrieving ARC job status: %s'
                             % err.what())
                job_obj['status'] += '(Error: ' + err.what() + ')'
            except arcwrapper.NoProxyError as err:
                logger.error('While retrieving ARC job status: %s'
                             % err.what())
                job_obj['status'] += '(Error: ' + err.what() + ')'
            except Exception as err:
                logger.error('Error retrieving ARC job status: %s' % err)
                job_obj['status'] += '(Error during retrieval)'

        exec_histories = []
        if verbose(flags):
            if 'EXECUTE' in job_dict:
                # Truncate very long command lines for display
                command_line = '; '.join(job_dict['EXECUTE'])
                if len(command_line) > 256:
                    job_obj['execute'] = '%s ...' % command_line[:252]
                else:
                    job_obj['execute'] = command_line
            res_conf = job_dict.get('RESOURCE_CONFIG', {})
            if 'RESOURCE_ID' in res_conf:
                public_id = res_conf['RESOURCE_ID']
                # Anonymize resource id unless the resource opted out
                if res_conf.get('ANONYMOUS', True):
                    public_id = anon_resource_id(public_id)
                job_obj['resource'] = public_id
            if job_dict.get('PUBLICNAME', False):
                job_obj['resource'] += ' (alias %(PUBLICNAME)s)' % job_dict
            if 'RESOURCE_VGRID' in job_dict:
                job_obj['vgrid'] = job_dict['RESOURCE_VGRID']

            if 'EXECUTION_HISTORY' in job_dict:
                counter = 0
                for history_dict in job_dict['EXECUTION_HISTORY']:
                    exec_history = \
                        {'object_type': 'execution_history'}

                    if 'QUEUED_TIMESTAMP' in history_dict:
                        exec_history['queued'] = \
                            time.asctime(history_dict['QUEUED_TIMESTAMP'
                                                      ])
                    if 'EXECUTING_TIMESTAMP' in history_dict:
                        exec_history['executing'] = \
                            time.asctime(history_dict['EXECUTING_TIMESTAMP'
                                                      ])
                    if 'PUBLICNAME' in history_dict:
                        if history_dict['PUBLICNAME']:
                            exec_history['resource'] = \
                                history_dict['PUBLICNAME']
                        else:
                            exec_history['resource'] = 'HIDDEN'
                    if 'RESOURCE_VGRID' in history_dict:
                        exec_history['vgrid'] = \
                            history_dict['RESOURCE_VGRID']
                    if 'FAILED_TIMESTAMP' in history_dict:
                        exec_history['failed'] = \
                            time.asctime(history_dict['FAILED_TIMESTAMP'
                                                      ])
                    if 'FAILED_MESSAGE' in history_dict:
                        exec_history['failed_message'] = \
                            history_dict['FAILED_MESSAGE']
                    exec_histories.append(
                        {'execution_history': exec_history,
                         'count': counter})
                    counter += 1

        if 'SCHEDULE_HINT' in job_dict:
            job_obj['schedule_hint'] = job_dict['SCHEDULE_HINT']

        # We should not show raw schedule_targets due to lack of anonymization
        if 'SCHEDULE_TARGETS' in job_dict:
            job_obj['schedule_hits'] = len(job_dict['SCHEDULE_TARGETS'])
        if 'EXPECTED_DELAY' in job_dict:

            # Catch None value
            if not job_dict['EXPECTED_DELAY']:
                job_obj['expected_delay'] = 0
            else:
                job_obj['expected_delay'] = int(job_dict['EXPECTED_DELAY'])

        job_obj['execution_histories'] = exec_histories

        # Per-job action links only make sense for an interactive display
        if interactive(flags):
            job_obj['statuslink'] = {'object_type': 'link',
                                     'destination':
                                     'fileman.py?path=%s/%s/'
                                     % (job_output_dir, job_id),
                                     'text': 'View status files'}
            job_obj['mrsllink'] = {'object_type': 'link',
                                   'destination':
                                   'mrslview.py?job_id=%s' % job_id,
                                   'text': 'View parsed mRSL contents'}

            if 'OUTPUTFILES' in job_dict and job_dict['OUTPUTFILES']:

                # Create a single ls link with all supplied outputfiles
                path_string = ''
                for path in job_dict['OUTPUTFILES']:

                    # OUTPUTFILES is either just combo path or src dst paths
                    parts = path.split()

                    # Always take last part as destination
                    path_string += 'path=%s;' % parts[-1]

                job_obj['outputfileslink'] = {'object_type': 'link',
                                              'destination':
                                              'ls.py?%s' % path_string,
                                              'text': 'View output files'}

            # Each action posts through a hidden CSRF-protected form and is
            # triggered by a javascript helper bound to the link
            form_method = 'post'
            csrf_limit = get_csrf_limit(configuration)
            target_op = 'resubmit'
            csrf_token = make_csrf_token(configuration, form_method,
                                         target_op, client_id, csrf_limit)
            js_name = 'resubmit%s' % hexlify(job_id)
            helper = html_post_helper(js_name, '%s.py' % target_op,
                                      {'job_id': job_id,
                                       csrf_field: csrf_token})
            output_objects.append({'object_type': 'html_form', 'text':
                                   helper})
            job_obj['resubmitlink'] = {'object_type': 'link',
                                       'destination':
                                       "javascript: %s();" % js_name,
                                       'text': 'Resubmit job'}

            target_op = 'jobaction'
            csrf_token = make_csrf_token(configuration, form_method,
                                         target_op, client_id, csrf_limit)
            js_name = 'freeze%s' % hexlify(job_id)
            helper = html_post_helper(js_name, '%s.py' % target_op,
                                      {'action': 'freeze',
                                       'job_id': job_id,
                                       csrf_field: csrf_token})
            output_objects.append({'object_type': 'html_form', 'text':
                                   helper})
            job_obj['freezelink'] = {'object_type': 'link',
                                     'destination':
                                     "javascript: %s();" % js_name,
                                     'text': 'Freeze job in queue'}
            js_name = 'thaw%s' % hexlify(job_id)
            helper = html_post_helper(js_name, '%s.py' % target_op,
                                      {'action': 'thaw',
                                       'job_id': job_id,
                                       csrf_field: csrf_token})
            output_objects.append({'object_type': 'html_form', 'text':
                                   helper})
            job_obj['thawlink'] = {'object_type': 'link',
                                   'destination':
                                   "javascript: %s();" % js_name,
                                   'text': 'Thaw job in queue'}
            js_name = 'cancel%s' % hexlify(job_id)
            helper = html_post_helper(js_name, '%s.py' % target_op,
                                      {'action': 'cancel',
                                       'job_id': job_id,
                                       csrf_field: csrf_token})
            output_objects.append({'object_type': 'html_form', 'text':
                                   helper})
            job_obj['cancellink'] = {'object_type': 'link',
                                     'destination':
                                     "javascript: %s();" % js_name,
                                     'text': 'Cancel job'}
            target_op = 'jobschedule'
            csrf_token = make_csrf_token(configuration, form_method,
                                         target_op, client_id, csrf_limit)
            js_name = 'jobschedule%s' % hexlify(job_id)
            helper = html_post_helper(js_name, '%s.py' % target_op,
                                      {'job_id': job_id,
                                       csrf_field: csrf_token})
            output_objects.append({'object_type': 'html_form', 'text':
                                   helper})
            job_obj['jobschedulelink'] = {'object_type': 'link',
                                          'destination':
                                          "javascript: %s();" % js_name,
                                          'text':
                                          'Request schedule information'}
            target_op = 'jobfeasible'
            csrf_token = make_csrf_token(configuration, form_method,
                                         target_op, client_id, csrf_limit)
            js_name = 'jobfeasible%s' % hexlify(job_id)
            helper = html_post_helper(js_name, '%s.py' % target_op,
                                      {'job_id': job_id,
                                       csrf_field: csrf_token})
            output_objects.append({'object_type': 'html_form', 'text':
                                   helper})
            job_obj['jobfeasiblelink'] = {'object_type': 'link',
                                          'destination':
                                          "javascript: %s();" % js_name,
                                          'text': 'Check job feasibility'}
            job_obj['liveiolink'] = {'object_type': 'link',
                                     'destination':
                                     'liveio.py?job_id=%s' % job_id,
                                     'text': 'Request live I/O'}
        job_list['jobs'].append(job_obj)

    output_objects.append(job_list)
    return (output_objects, status)
def main(client_id, user_arguments_dict): """Main function used by front end""" (configuration, logger, output_objects, op_name) = \ initialize_main_variables(client_id, op_header=False) client_dir = client_id_dir(client_id) defaults = signature()[1] status = returnvalues.OK (validate_status, accepted) = validate_input_and_cert( user_arguments_dict, defaults, output_objects, client_id, configuration, allow_rejects=False, # NOTE: src can use wildcards, dst cannot typecheck_overrides={'src': valid_path_pattern}, ) if not validate_status: return (accepted, returnvalues.CLIENT_ERROR) job_ids = accepted['job_id'] action = accepted['action'][-1] src = accepted['src'] dst = accepted['dst'][-1] title_entry = find_entry(output_objects, 'title') title_entry['text'] = '%s live I/O' % configuration.short_title add_import, add_init, add_ready = '', '', '' add_init += ''' var fields = 1; var max_fields = 20; var src_input = "<input class='fillwidth' type=text name=src value='' /><br />"; function addSource() { if (fields < max_fields) { $("#srcfields").append(src_input); fields += 1; } else { alert("Maximum " + max_fields + " source fields allowed!"); } } ''' title_entry['script']['advanced'] += add_import title_entry['script']['init'] += add_init title_entry['script']['ready'] += add_ready output_objects.append({ 'object_type': 'header', 'text': 'Request live communication with jobs' }) if not configuration.site_enable_jobs: output_objects.append({ 'object_type': 'error_text', 'text': '''Job execution is not enabled on this system''' }) return (output_objects, returnvalues.SYSTEM_ERROR) if not action in valid_actions: output_objects.append({ 'object_type': 'error_text', 'text': 'Invalid action "%s" (supported: %s)' % (action, ', '.join(valid_actions)) }) return (output_objects, returnvalues.CLIENT_ERROR) if action in post_actions: if not safe_handler(configuration, 'post', op_name, client_id, get_csrf_limit(configuration), accepted): output_objects.append({ 'object_type': 
'error_text', 'text': '''Only accepting CSRF-filtered POST requests to prevent unintended updates''' }) return (output_objects, returnvalues.CLIENT_ERROR) if not job_ids or action in interactive_actions: job_id = '' if job_ids: job_id = job_ids[-1] form_method = 'post' csrf_limit = get_csrf_limit(configuration) fill_helpers = { 'job_id': job_id, 'form_method': form_method, 'csrf_field': csrf_field, 'csrf_limit': csrf_limit } target_op = 'liveio' csrf_token = make_csrf_token(configuration, form_method, target_op, client_id, csrf_limit) fill_helpers.update({'target_op': target_op, 'csrf_token': csrf_token}) output_objects.append({ 'object_type': 'text', 'text': ''' Fill in the live I/O details below to request communication with a running job. Job ID can be a full ID or a wild card pattern using "*" and "?" to match one or more of your job IDs. Use send output without source and destination paths to request upload of the default stdio files from the job on the resource to the associated job_output directory in your MiG home. Destination is a always handled as a directory path to put source files into. Source and destination paths are always taken relative to the job execution directory on the resource and your MiG home respectively. 
''' }) html = ''' <table class="liveio"> <tr> <td> <form method="%(form_method)s" action="%(target_op)s.py"> <table class="liveio"> <tr><td class=centertext> </td></tr> <tr><td> Action:<br /> <input type="hidden" name="%(csrf_field)s" value="%(csrf_token)s" /> <input type=radio name=action checked value="send" />send output <input type=radio name=action value="get" />get input </td></tr> <tr><td> Job ID:<br /> <input class="fillwidth" type=text name=job_id value="%(job_id)s" /> </td></tr> <tr><td> Source path(s):<br /> <div id="srcfields"> <input class="fillwidth" type=text name=src value="" /><br /> </div> <input id="addsrcbutton" type="button" onclick="addSource(); return false;" value="Add another source field" /> </td></tr> <tr><td> Destination path:<br /> <input class="fillwidth" type=text name=dst value="" /> </td></tr> <tr><td> <input type="submit" value="Send request" /> </td></tr> </table> </form> </td> </tr> </table> ''' % fill_helpers output_objects.append({'object_type': 'html_form', 'text': html}) output_objects.append({ 'object_type': 'text', 'text': ''' Further live job control is avalable through your personal message queues. They provide a basic interface for centrally storing messages under your grid account and can be used to pass messages between jobs or for orchestrating jobs before and during execution. 
''' }) output_objects.append({ 'object_type': 'link', 'destination': 'mqueue.py', 'text': 'Message queue interface' }) return (output_objects, returnvalues.OK) elif action in ['get', 'receive', 'input']: action = 'get' action_desc = 'will be downloaded to the job on the resource' elif action in ['put', 'send', 'output']: action = 'send' action_desc = 'will be uploaded from the job on the resource' else: output_objects.append({ 'object_type': 'error_text', 'text': 'Invalid live io action: %s' % action }) return (output_objects, returnvalues.CLIENT_ERROR) output_objects.append({ 'object_type': 'text', 'text': 'Requesting live I/O for %s' % ', '.join(job_ids) }) if action == 'get' and (not src or not dst): output_objects.append({ 'object_type': 'error_text', 'text': 'src and dst parameters required for live input' }) return (output_objects, returnvalues.CLIENT_ERROR) # Automatic fall back to stdio files if output with no path provided if src: src_text = 'The files ' + ' '.join(src) else: src_text = 'The job stdio files' if dst: dst_text = 'the ' + dst + ' directory' else: dst_text = 'the corresponding job_output directory' # Please note that base_dir must end in slash to avoid access to other # user dirs when own name is a prefix of another user name base_dir = \ os.path.abspath(os.path.join(configuration.mrsl_files_dir, client_dir)) + os.sep filelist = [] for job_id in job_ids: job_id = job_id.strip() # is job currently being executed? 
# Backward compatibility - all_jobs keyword should match all jobs if job_id == all_jobs: job_id = '*' # Check directory traversal attempts before actual handling to avoid # leaking information about file system layout while allowing # consistent error messages unfiltered_match = glob.glob(base_dir + job_id + '.mRSL') match = [] for server_path in unfiltered_match: # IMPORTANT: path must be expanded to abs for proper chrooting abs_path = os.path.abspath(server_path) if not valid_user_path(configuration, abs_path, base_dir, True): # out of bounds - save user warning for later to allow # partial match: # ../*/* is technically allowed to match own files. logger.warning("%s tried to %s restricted path %s ! (%s)" % (client_id, op_name, abs_path, job_id)) continue # Insert valid job files in filelist for later treatment match.append(abs_path) # Now actually treat list of allowed matchings and notify if no # (allowed) match.... if not match: output_objects.append({ 'object_type': 'error_text', 'text': '%s: You do not have any matching job IDs!' % job_id }) else: filelist += match for filepath in filelist: # Extract job_id from filepath (replace doesn't modify filepath) mrsl_file = filepath.replace(base_dir, '') job_id = mrsl_file.replace('.mRSL', '') job_dict = unpickle(filepath, logger) if not job_dict: status = returnvalues.CLIENT_ERROR output_objects.append({ 'object_type': 'error_text', 'text': ('You can only list status of your own jobs. ' 'Please verify that you submitted the mRSL file ' 'with job id "%s" (Could not unpickle mRSL file %s)') % (job_id, filepath) }) continue if job_dict['STATUS'] != 'EXECUTING': output_objects.append({ 'object_type': 'text', 'text': 'Job %s is not currently being executed! Job status: %s' % (job_id, job_dict['STATUS']) }) continue if job_dict['UNIQUE_RESOURCE_NAME'] == 'ARC': output_objects.append({ 'object_type': 'text', 'text': 'Job %s is submitted to ARC, details are not available!' 
% job_id }) continue last_live_update_dict = {} last_live_update_file = configuration.mig_system_files + os.sep\ + job_id + '.last_live_update' if os.path.isfile(last_live_update_file): last_live_update_dict_unpickled = \ unpickle(last_live_update_file, logger) if not last_live_update_dict_unpickled: output_objects.append({ 'object_type': 'error_text', 'text': 'Could not unpickle %s - skipping request!' % last_live_update_file }) continue if 'LAST_LIVE_UPDATE_REQUEST_TIMESTAMP' not in last_live_update_dict_unpickled: output_objects.append({ 'object_type': 'error_text', 'text': 'Could not find needed key in %s.' % last_live_update_file }) continue last_live_update_request = \ last_live_update_dict_unpickled['LAST_LIVE_UPDATE_REQUEST_TIMESTAMP' ] difference = datetime.datetime.now() - last_live_update_request try: min_delay = \ int(configuration.min_seconds_between_live_update_requests) except: min_delay = 30 if difference.seconds < min_delay: output_objects.append({ 'object_type': 'error_text', 'text': ('Request not allowed, you must wait at least ' '%s seconds between live update requests!') % min_delay }) continue # save this request to file to avoid DoS from a client request loop. last_live_update_dict['LAST_LIVE_UPDATE_REQUEST_TIMESTAMP'] = \ datetime.datetime.now() pickle_ret = pickle(last_live_update_dict, last_live_update_file, logger) if not pickle_ret: output_objects.append({ 'object_type': 'error_text', 'text': 'Error saving live io request timestamp to last_live_update ' 'file, request not sent!' 
}) continue # # # ## job is being executed right now, send live io request to frontend # # # get resource_config, needed by scp_file_to_resource # (res_status, resource_config) = get_resource_configuration( # resource_home, unique_resource_name, logger) resource_config = job_dict['RESOURCE_CONFIG'] (res_status, exe) = get_resource_exe(resource_config, job_dict['EXE'], logger) if not res_status: output_objects.append({ 'object_type': 'error_text', 'text': 'Could not get exe configuration for job %s' % job_id }) continue local_file = '%s.%supdate' % (job_dict['LOCALJOBNAME'], action) if not os.path.exists(local_file): # create try: filehandle = open(local_file, 'w') filehandle.write('job_id ' + job_dict['JOB_ID'] + '\n') filehandle.write('localjobname ' + job_dict['LOCALJOBNAME'] + '\n') filehandle.write('execution_user ' + exe['execution_user'] + '\n') filehandle.write('execution_node ' + exe['execution_node'] + '\n') filehandle.write('execution_dir ' + exe['execution_dir'] + '\n') filehandle.write('target liveio\n') # Leave defaults src and dst to FE script if not provided if src: filehandle.write('source ' + ' '.join(src) + '\n') if dst: filehandle.write('destination ' + dst + '\n') # Backward compatible test for shared_fs - fall back to scp if 'shared_fs' in exe and exe['shared_fs']: filehandle.write('copy_command cp\n') filehandle.write('copy_frontend_prefix \n') filehandle.write('copy_execution_prefix \n') else: filehandle.write('copy_command scp -B\n') filehandle.write( 'copy_frontend_prefix ${frontend_user}@${frontend_node}:\n' ) filehandle.write( 'copy_execution_prefix ${execution_user}@${execution_node}:\n' ) filehandle.write('### END OF SCRIPT ###\n') filehandle.close() except Exception as exc: pass if not os.path.exists(local_file): output_objects.append({ 'object_type': 'error_text', 'text': '.%supdate file not available on %s server' % (action, configuration.short_title) }) continue scp_status = copy_file_to_resource( local_file, '%s.%supdate' % 
(job_dict['LOCALJOBNAME'], action), resource_config, logger) if not scp_status: output_objects.append({ 'object_type': 'error_text', 'text': 'Error sending request for live io to resource!' }) continue else: output_objects.append({ 'object_type': 'text', 'text': 'Request for live io was successfully sent to the resource!' }) output_objects.append({ 'object_type': 'text', 'text': '%s %s and should become available in %s in a minute.' % (src_text, action_desc, dst_text) }) if action == 'send': if not dst: target_path = '%s/%s/*' % (job_output_dir, job_id) else: target_path = dst enc_url = 'ls.py?path=%s' % quote(target_path) output_objects.append({ 'object_type': 'link', 'destination': enc_url, 'text': 'View uploaded files' }) else: enc_url = 'ls.py?path=' enc_url += ';path='.join([quote(i) for i in src]) output_objects.append({ 'object_type': 'link', 'destination': enc_url, 'text': 'View files for download' }) try: os.remove(local_file) except Exception as exc: pass return (output_objects, returnvalues.OK)