def get_job_ids_with_specified_project_name(
    client_id,
    project_name,
    mrsl_files_dir,
    logger,
):
    """Helper for finding a job with a given project field.

    Scans every mRSL file submitted by client_id and collects the JOB_ID
    of each job whose PROJECT field equals project_name. Returns the list
    of matching job IDs (possibly empty).
    """

    client_dir = client_id_dir(client_id)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name

    base_dir = os.path.abspath(os.path.join(mrsl_files_dir, client_dir)) \
        + os.sep

    # this is heavy :-/ we must loop all the mrsl files submitted by the user
    # to find the job ids belonging to the specified project

    matching_job_ids = []
    for mrsl_file in os.listdir(base_dir):
        job_dict = unpickle(os.path.join(base_dir, mrsl_file), logger)
        if not job_dict:
            continue
        # 'in' replaces the deprecated dict.has_key(); explicit membership
        # test kept so a project_name of None can never match a missing key
        if 'PROJECT' in job_dict and job_dict['PROJECT'] == project_name:
            matching_job_ids.append(job_dict['JOB_ID'])
    return matching_job_ids
def update_section_helper(client_id, configuration, section_filename,
                          changes, defaults, create_missing=True):
    """Update settings section in pickled file with values from changes
    dictionary. Optional create_missing can be used if the pickle should
    be created if not already there.
    The defaults dictionary is used to set any missing values.
    Raises Exception when the section file is missing (and create_missing
    is False), cannot be loaded or cannot be saved.
    """

    client_dir = client_id_dir(client_id)
    section_path = os.path.join(configuration.user_settings, client_dir,
                                section_filename)
    if not os.path.exists(section_path):
        if create_missing:
            section_dict = {}
        else:
            raise Exception('no %s file to update!' % section_filename)
    else:
        section_dict = unpickle(section_path, configuration.logger)
        # unpickle returns False on failure - bail out instead of letting
        # the .get() calls below blow up with a TypeError on a bool
        if section_dict is False:
            raise Exception('could not load %s file!' % section_filename)
    # fill any missing keys from defaults, then overlay requested changes
    for (key, val) in defaults.items():
        section_dict[key] = section_dict.get(key, val)
    section_dict.update(changes)
    if not pickle(section_dict, section_path, configuration.logger):
        raise Exception('could not save updated %s file!' % section_filename)
    return section_dict
def list_items_in_pickled_list(path, logger, allow_missing=False):
    """Return (status, items) for the pickled list stored at path."""

    items = unpickle(path, logger, allow_missing)
    if items is False:
        return (False, 'Failure: could not unpickle list')
    return (True, items)
def recv_notification(configuration, path):
    """Read notification event from file and merge it into the module-level
    received_notifications cache.

    The pickled event must hold a 'user_id' that resolves to a client id
    with an email field and a 'category' list; 'timestamp' and 'message'
    are folded into the per-client entry (earliest timestamp wins and
    duplicate messages are counted rather than stored repeatedly).
    Returns True on success, False otherwise.
    """
    logger = configuration.logger
    # logger.debug("read_notification: %s" % file)
    status = True
    new_notification = unpickle(path, logger)
    if not new_notification:
        logger.error("Failed to unpickle: %s" % path)
        return False
    user_id = new_notification.get('user_id', '')
    # logger.debug("Received user_id: '%s'" % user_id)
    if not user_id:
        status = False
        logger.error("Missing user_id in notification: %s" % path)
    else:
        client_id = expand_openid_alias(user_id, configuration)
        # logger.debug("resolved client_id: '%s'" % client_id)
        if not client_id or not extract_field(client_id, 'email'):
            status = False
            logger.error("Failed to resolve client_id from user_id: '%s'"
                         % user_id)
    if status:
        category = new_notification.get('category', [])
        # logger.debug("Received category: %s" % category)
        if not isinstance(category, list):
            status = False
            logger.error("Received category: %s must be a list" % category)
    if status:
        logger.info("Received event: %s, from: '%s'"
                    % (category, client_id))
        new_timestamp = new_notification.get('timestamp')
        message = new_notification.get('message', '')
        # logger.debug("Received message: %s" % message)
        # the .get()-then-install dance below shares one mutable dict
        # between the cache and the local name on first sight of client_id
        client_dict = received_notifications.get(client_id, {})
        if not client_dict:
            received_notifications[client_id] = client_dict
        files_list = client_dict.get('files', [])
        if not files_list:
            client_dict['files'] = files_list
        if path in files_list:
            # fixed typo in log message ('prevoursly' -> 'previously')
            logger.warning("Skipping previously received notification: '%s'"
                           % path)
        else:
            files_list.append(path)
        # keep the earliest timestamp seen for this client
        client_dict['timestamp'] = min(
            client_dict.get('timestamp', sys.maxint), new_timestamp)
        messages_dict = client_dict.get('messages', {})
        if not messages_dict:
            client_dict['messages'] = messages_dict
        header = " ".join(category)
        if not header:
            header = '* UNKNOWN *'
        body_dict = messages_dict.get(header, {})
        if not body_dict:
            messages_dict[header] = body_dict
        # count duplicate messages instead of storing them repeatedly
        message_count = body_dict.get(message, 0)
        body_dict[message] = message_count + 1
    return status
def list_items_in_pickled_list(path, logger):
    """Unpickle the list stored at path, returning (status, list)."""

    items = unpickle(path, logger)
    # an empty list is valid content; only a falsy non-list result
    # (False/None) signals an unpickle failure
    if items != [] and not items:
        return (False, 'Failure: could not unpickle list')
    return (True, items)
def fill_triggers(configuration, vgrids_dict):
    """Search for system_imagesettings triggers and the needed information,
    such as rule_id, run_as and path to *vgrids_dict*

    Mutates each vgrids_dict entry in place by appending matching trigger
    info dicts to its 'triggers' list. Returns False as soon as one
    vgrid's trigger file fails to load, True otherwise.
    """

    status = True
    logger = configuration.logger
    logger.info(str(vgrids_dict.keys()))
    for key in vgrids_dict:
        logger.info('----------------------------------------------')
        logger.info('%s' % key)
        logger.info('----------------------------------------------')
        vgrid = vgrids_dict[key]['vgrid']
        vgridpath = vgrids_dict[key]['vgridpath']
        # trigger configuration lives in
        # <vgrid_home>/<vgrid>/<vgrid_triggers>
        trigger_file = \
            os.path.join(os.path.join(configuration.vgrid_home, vgrid),
                         configuration.vgrid_triggers)
        if not os.path.exists(trigger_file):
            logger.warning("Missing trigger configuration: '%s'"
                           % trigger_file)
        else:
            triggers = unpickle(trigger_file, logger)
            # unpickle returns False on failure, hence the isinstance test
            if not isinstance(triggers, list):
                status = False
                logger.error("Couldn't load trigger configuration: '%s'"
                             % trigger_file)
                break
            for trigger in triggers:
                if trigger['rule_id'].startswith('system_imagesettings_'):
                    vgridtriggerpath = get_vgridtriggerpath(
                        vgrid, trigger['path'])
                    # meta_created / dir_deleted rules apply to the whole
                    # vgrid; any other rule only when its path matches
                    if trigger['rule_id'] \
                        == 'system_imagesettings_meta_created' \
                        or trigger['rule_id'] \
                        == 'system_imagesettings_dir_deleted' \
                            or vgridtriggerpath == vgridpath:
                        logger.info("vgrid: '%s'" % vgrid)
                        logger.info("path: '%s'" % vgridpath)
                        logger.info("rule_id: '%s'" % trigger['rule_id'])
                        logger.info("run_as '%s'" % trigger['run_as'])
                        logger.info(
                            '----------------------------------------------')
                        # keep only the fields later processing needs
                        trigger = {
                            'rule_id': trigger['rule_id'],
                            'run_as': trigger['run_as'],
                            'path': vgridpath
                        }
                        vgrids_dict[key]['triggers'].append(trigger)
    return status
def load_queue(path, logger):
    """Load job queue from path, attaching the current logger on success."""

    queue = io.unpickle(path, logger)
    if not queue:
        # unpickle not successful
        return None
    queue.logger = logger
    return queue
def get_resource_configuration(resource_home, unique_resource_name, logger):
    """Load the pickled config file for unique_resource_name."""

    # open the configuration file
    config_path = "%s/%s/config" % (resource_home, unique_resource_name)
    config = unpickle(config_path, logger)
    if config:
        return (True, config)
    msg = "could not unpickle %s" % config_path
    logger.error(msg)
    return (False, msg)
def is_item_in_pickled_list(path, item, logger):
    """Return True if item is in the pickled list at path, else False."""

    items = unpickle(path, logger)
    # unpickle returns False on failure; an empty list is also falsy and
    # trivially does not contain item, so one test covers both original
    # early-return branches
    if not items:
        return False
    return item in items
def main(): configuration = get_configuration_object() # Overwrite default logger logger = configuration.logger = get_logger(logging.INFO) logger = configuration.logger = get_logger(logging.INFO) vgrids_dict = unpickle(TRIGGER_DICT_FILE, logger) vgrid_list = get_vgrids_dict(vgrids_dict) for name in vgrid_list: print name
def main():
    """Filter vgrids, back up trigger/imagesettings state and update the
    backend trigger configuration.

    Accepts one optional command line argument: a comma separated list of
    vgrids to restrict the update to. Returns 0 on success, 1 on failure.
    """

    status = True
    configuration = get_configuration_object()
    # Overwrite default logger
    logger = configuration.logger = get_logger(logging.INFO)
    argc = len(sys.argv) - 1
    user_vgrid_list = None
    if argc == 1:
        # optional argument: comma separated vgrid names to process
        user_vgrid_list = [vgrid.strip() for vgrid in sys.argv[1].split(',')]
        logger.info('Using custom vgrid_list: %s' % user_vgrid_list)
    vgrids_dict = unpickle(TRIGGER_DICT_FILE, logger)
    update_trigger_dict = None
    if vgrids_dict:
        (vgrids_dict, vgrid_list) = filter_vgrids_dict(configuration,
                                                       vgrids_dict,
                                                       user_vgrid_list)
    else:
        status = False
        logger.error("Missing vgrid dict file: '%s'" % TRIGGER_DICT_FILE)
    # each step below only runs when all previous steps succeeded
    if status:
        status = backup_trigger_files(configuration, vgrid_list)
    if status:
        status = backup_imagesettings_files(configuration, vgrid_list)
    if status:
        status = backup_paraview_links(configuration, vgrid_list)
    if status:
        update_trigger_dict = \
            get_update_trigger_dict_and_check_for_unique_clientid(
                configuration, vgrids_dict)
        if update_trigger_dict is None:
            status = False
    if status:
        status = remove_triggers(configuration, vgrids_dict)
    if status:
        status = update_backend(configuration, update_trigger_dict)
    if status:
        return 0
    else:
        return 1
def get_resource_configuration(resource_home, unique_resource_name, logger):
    """Load a resource configuration from file"""

    # open the configuration file
    conf_path = '%s/%s/config' % (resource_home, unique_resource_name)
    conf = unpickle(conf_path, logger)
    if not conf:
        msg = 'could not unpickle %s' % conf_path
        logger.error(msg)
        return (False, msg)
    return (True, conf)
def get_dict_from_display_number(display_number, configuration, logger):
    """Look up display_number in the pickled display dict.

    Returns (display_number, entry) when found, (True, -1) when the
    display is unknown and (False, error-message) on failure.
    """

    (init_ret, filename) = \
        initialize_and_get_display_dict_filename(configuration, logger)
    if not init_ret:
        return (False, 'could not initialize')
    # renamed from 'dict' to avoid shadowing the builtin; identity test
    # instead of '==' since unpickle signals failure with False
    display_dict = unpickle(filename, logger)
    if display_dict is False:
        # dropped stray debug print left in the original
        return (False, 'could not unpickle %s' % filename)
    if display_number in display_dict:
        return (display_number, display_dict[display_number])
    return (True, -1)
def load_section_helper(client_id, configuration, section_filename,
                        section_keys, include_meta=False):
    """Load settings section from pickled file.
    Optional include_meta controls the inclusion of meta data like creator
    and creation time.
    """

    section_path = os.path.join(configuration.user_settings,
                                client_id_dir(client_id), section_filename)
    section_dict = unpickle(section_path, configuration.logger)
    if section_dict and not include_meta:
        # strip everything outside the expected section keys
        for key in section_dict.keys():
            if key not in section_keys:
                del section_dict[key]
    return section_dict
def fill_from_mrsl(self, job_data): """Read a pickled mRSL file and fill in the contained data""" # jobData supposed to be a pickled file or a dictionary. # add a solid type later! self.__logger.debug('filling in job data from file %s' % job_data) try: if os.path.exists(job_data): job = unpickle(job_data, self.__logger) self.fill_from_dict(job) else: self.__logger.error('file %s does not exist.' % job_data) except Exception, err: self.__logger.error('while filling in data from %s: %s' % (job_data, err))
def testresource_has_re_specified(unique_resource_name, re_name,
                                  configuration):
    """Check if unique_resource_name has runtime env enabled"""

    conf_path = '%s%s/config' % (configuration.resource_home,
                                 unique_resource_name)
    resource_config = unpickle(conf_path, configuration.logger)
    if not resource_config:
        configuration.logger.error('error unpickling resource config')
        return False
    # RUNTIMEENVIRONMENT holds (name, value) pairs
    for (res_name, _res_val) in resource_config['RUNTIMEENVIRONMENT']:
        if res_name == re_name:
            return True
    return False
def testresource_has_re_specified(unique_resource_name, re_name,
                                  configuration):
    """Check if unique_resource_name has runtime env enabled"""

    logger = configuration.logger
    resource_config = unpickle(configuration.resource_home
                               + unique_resource_name + '/config', logger)
    if not resource_config:
        logger.error('error unpickling resource config')
        return False
    # each entry is a (name, value) pair; only the name matters here
    match = [True for (res_name, _val) in
             resource_config['RUNTIMEENVIRONMENT'] if res_name == re_name]
    return bool(match)
def remove_item_from_pickled_list(
    path,
    item,
    logger,
    allow_empty_list=True,
):
    """Remove item from the pickled list at path and save it back.

    Returns (status, output-message). With allow_empty_list set to False
    the last remaining item cannot be removed.
    """

    list_ = unpickle(path, logger)
    output = ''
    if list_ == []:

        # OK, if the list is empty

        pass
    elif not list_:
        output += 'Failure: could not unpickle current list'
        return (False, output)

    # Check if the item is in the list

    item = item.strip()
    if not item in list_:
        output += '%s not found in list' % item
        return (False, output)

    if not allow_empty_list:
        if len(list_) <= 1:
            output += 'You cannot remove the last item'
            return (False, output)

    # ok, lets remove the item and pickle and save the new list

    try:
        list_.remove(item)
    except Exception:
        # narrowed from a bare except, which would also swallow
        # KeyboardInterrupt and SystemExit
        output += \
            'Strange error, %s could not be removed, but it seems to be in the list'\
            % item
        return (False, output)

    status = pickle(list_, path, logger)
    if not status:
        output += 'Error pickling new owners file'
        return (False, output)
    return (True, output)
def set_user_display_inactive( client_id, display_number, configuration, logger, ): (init_ret, filename) = \ initialize_and_get_display_dict_filename(configuration, logger) if not init_ret: return (False, 'could not initialize') current_display = get_users_display_number(client_id, configuration, logger) if not current_display: return ( False, 'could not remove active display since no entry was found for %s' % client_id) if current_display == -1: return ( False, 'user %s does not have a display registered, unable to inactivate any display' % client_id) if current_display != display_number: return ( False, 'user %s had display %s registered in dict, but specified display_number in set_user_display_inactive was %s' % (client_id, current_display, display_number)) # remove entry from dict and pickle it dict = unpickle(filename, logger) if dict == False: return (False, 'could not unpickle %s' % filename) if not dict.has_key(display_number): return (False, 'display %s not found in dict' % display_number) try: del dict[display_number] except Exception, e: return ( False, 'exception trying to remove %s from display dict. Exception %s' % (display_number, e))
def get_users_display_dict(client_id, configuration, logger):
    """Find the (display_number, entry) registered for client_id.

    Returns (-1, -1) when the user has no live display and
    (False, error-message) on failure.
    """

    (init_ret, filename) = \
        initialize_and_get_display_dict_filename(configuration, logger)
    if not init_ret:
        return (False, 'could not initialize')
    # renamed from 'dict' to avoid shadowing the builtin; identity test
    # since unpickle signals failure with False
    display_dict = unpickle(filename, logger)
    if display_dict is False:
        return (False, 'could not unpickle %s' % filename)
    for (key, value) in display_dict.items():
        if value['client_id'] == client_id:
            return (key, value)

    # not found, client_id does not have a live display

    return (-1, -1)
def set_user_display_inactive( client_id, display_number, configuration, logger, ): (init_ret, filename) = \ initialize_and_get_display_dict_filename(configuration, logger) if not init_ret: return (False, 'could not initialize') current_display = get_users_display_number(client_id, configuration, logger) if not current_display: return (False, 'could not remove active display since no entry was found for %s' % client_id) if current_display == -1: return (False, 'user %s does not have a display registered, unable to inactivate any display' % client_id) if current_display != display_number: return (False, 'user %s had display %s registered in dict, but specified display_number in set_user_display_inactive was %s' % (client_id, current_display, display_number)) # remove entry from dict and pickle it dict = unpickle(filename, logger) if dict == False: return (False, 'could not unpickle %s' % filename) if not dict.has_key(display_number): return (False, 'display %s not found in dict' % display_number) try: del dict[display_number] except Exception, e: return (False, 'exception trying to remove %s from display dict. Exception %s' % (display_number, e))
def _load_rate_limits(configuration, proto, do_lock=True):
    """Load rate limits dict"""

    logger = configuration.logger
    rate_limits_filepath = os.path.join(
        configuration.mig_system_run,
        "%s.%s" % (proto, _rate_limits_filename))
    # pre-initialize so the name always exists, mirroring _load_sessions;
    # it is only actually used when do_lock is set
    rate_limits_lock = None
    if do_lock:
        rate_limits_lock = _acquire_rate_limits_lock(configuration, proto,
                                                     exclusive=False)
    result = unpickle(rate_limits_filepath, logger)
    if do_lock:
        _release_rate_limits_lock(rate_limits_lock)
    # unpickle returns False on failure - fall back to an empty dict
    if not isinstance(result, dict):
        logger.warning("failed to retrieve active %s rate limits from %s" %
                       (proto, rate_limits_filepath))
        result = {}
    return result
def migrated_job(filename, client_id, configuration):
    """Enqueue a job file received from another server for client_id.

    returns a tuple (bool status, str msg)
    """

    logger = configuration.logger
    client_dir = client_id_dir(client_id)
    job_path = os.path.abspath(
        os.path.join(configuration.server_home, client_dir, filename))

    # unpickle and enqueue received job file

    # NOTE(review): this triple-backslash space escaping looks fragile -
    # confirm against how the migration sender encodes spaces in filenames
    job_path_spaces = job_path.replace('\\ ', '\\\\\\ ')
    job = io.unpickle(job_path_spaces, configuration.logger)

    # TODO: update any fields to mark migration?

    if not job:
        return (False,
                'Fatal migration error: loading pickled job (%s) failed! ' %
                job_path_spaces)
    job_id = job['JOB_ID']

    # save file with other mRSL files

    mrsl_filename = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir, job_id + '.mRSL'))
    if not io.pickle(job, mrsl_filename, configuration.logger):
        return (False, 'Fatal error: Could not write ' + filename)

    # tell 'grid_script'

    message = 'SERVERJOBFILE ' + client_dir + '/' + job_id + '\n'
    if not io.send_message_to_grid_script(message, logger, configuration):
        return (False, 'Fatal error: Could not write to grid stdin')

    # TODO: do we need to wait for grid_script to ack job reception?
    # ... same question applies to new_job, btw.

    # fixed spelling of user-facing message ('succesfully')
    return (True, '%s successfully migrated.' % job_id)
def _load_sessions(configuration, proto, do_lock=True):
    """Load sessions dict"""

    logger = configuration.logger
    sessions_filepath = os.path.join(configuration.mig_system_run,
                                     "%s.%s" % (proto, _sessions_filename))
    sessions_lock = None
    if do_lock:
        sessions_lock = _acquire_sessions_lock(
            configuration, proto, exclusive=False)
    loaded = unpickle(sessions_filepath, logger)
    if do_lock:
        _release_sessions_lock(sessions_lock)
    # unpickle returns False on failure - fall back to an empty dict
    if isinstance(loaded, dict):
        return loaded
    logger.warning("failed to retrieve active %s sessions from %s" %
                   (proto, sessions_filepath))
    return {}
def create_job_object_from_pickled_mrsl(filepath, logger, external_dict):
    """Helper for submit from pickled mRSL.

    Builds a Job object exposing only the fields listed in external_dict
    (to avoid leaking info that breaks anonymity). Returns (True, job) on
    success and (False, error-message) on unpickle failure.
    """

    # local import keeps the block self-contained
    import time

    job_dict = unpickle(filepath, logger)
    if not job_dict:
        return (False, 'could not unpickle mrsl file %s' % filepath)
    jobo = Job()
    for (key, value) in job_dict.iteritems():
        if isinstance(value, time.struct_time):

            # time.struct_time objects cannot be marshalled in the xmlrpc
            # version we use - isinstance replaces the fragile
            # str(type(...)) string comparison

            value = str(value)
        if key in external_dict:

            # ok, this info can be shown to the user (avoid leaking info that
            # break anonymity)

            setattr(jobo, key, value)
    return (True, jobo)
def load_section_helper(client_id, configuration, section_filename,
                        section_keys, include_meta=False,
                        allow_missing=False):
    """Load settings section from pickled file.
    Optional include_meta controls the inclusion of meta data like creator
    and creation time.
    Optional allow_missing is used to avoid log errors for sections that
    may or may not already exist.
    """

    section_path = os.path.join(configuration.user_settings,
                                client_id_dir(client_id), section_filename)
    section_dict = unpickle(section_path, configuration.logger,
                            allow_missing)
    if section_dict and not include_meta:
        # drop everything outside the expected section keys
        for key in section_dict.keys():
            if key not in section_keys:
                del section_dict[key]
    return section_dict
def __check_dict(self, stattype_key, stattype_value): """Checking if dict exists on disk and loads it into memory""" # If stattype doesn't exist create it if not self.__gridstat_dict.has_key(stattype_key): self.__gridstat_dict[stattype_key] = {} # If stattype not in memory, check if pickled file exists if not self.__gridstat_dict[stattype_key].has_key(stattype_value): filename = self.__configuration.gridstat_files_dir\ + stattype_key + os.sep + stattype_value.upper()\ + '.pck' if os.path.exists(filename): stat_dict = unpickle(filename, self.__logger) else: stat_dict = None if stat_dict: self.__gridstat_dict[stattype_key][stattype_value] = \ stat_dict else: self.__gridstat_dict[stattype_key][stattype_value] = {}
def add_item_to_pickled_list(path, item, logger):
    """Append item to the pickled list at path and save it back."""

    list_ = unpickle(path, logger)
    output = ''
    # an empty list is fine, but a falsy non-list means the unpickle failed
    if list_ != [] and not list_:
        output += 'Failure: could not unpickle current list'
        return (False, output)

    # Check if the item already is in the list

    if item in list_:
        output += '%s is already in the list' % item
        return (False, output)

    # ok, lets add the new item and pickle and save the new list

    list_.append(item)
    if not pickle(list_, path, logger):
        output += 'pickle error'
        return (False, output)
    return (True, '')
return (False, '''Fatal error: Could not get exclusive access or write to %s''' % configuration.grid_stdin) if forceddestination and forceddestination.has_key('RE_NAME'): # add job_id to runtime environment verification history unique_resource_name = forceddestination['UNIQUE_RESOURCE_NAME'] re_name = forceddestination['RE_NAME'] resource_config_filename = configuration.resource_home\ + unique_resource_name + '/config' # open resource config resource_config = unpickle(resource_config_filename, logger) if not resource_config: logger.error('error unpickling resource config') return (False, 'error unpickling resource config') dict_entry = (job_id, client_id) # add entry to runtime verification history if not resource_config.has_key('RUNTVERIFICATION'): resource_config['RUNTVERIFICATION'] = \ {re_name: [dict_entry]} else: before_runt_dict = resource_config['RUNTVERIFICATION'] if not before_runt_dict.has_key(re_name): before_runt_dict[re_name] = [].append(dict_entry)
def main(client_id, user_arguments_dict): """Main function used by front end""" (configuration, logger, output_objects, op_name) = \ initialize_main_variables(client_id, op_header=False) client_dir = client_id_dir(client_id) defaults = signature()[1] (validate_status, accepted) = validate_input_and_cert( user_arguments_dict, defaults, output_objects, client_id, configuration, allow_rejects=False, ) if not validate_status: return (accepted, returnvalues.CLIENT_ERROR) job_ids = accepted['job_id'] action = accepted['action'][-1] src = accepted['src'] dst = accepted['dst'][-1] title_entry = find_entry(output_objects, 'title') title_entry['text'] = '%s live I/O' % configuration.short_title output_objects.append({'object_type': 'header', 'text' : 'Request live communication with jobs'}) if not action in valid_actions: output_objects.append({'object_type': 'error_text', 'text' : 'Invalid action "%s" (supported: %s)' % \ (action, ', '.join(valid_actions))}) return (output_objects, returnvalues.CLIENT_ERROR) if action in post_actions and not correct_handler('POST'): output_objects.append( {'object_type': 'error_text', 'text' : 'Only accepting POST requests to prevent unintended updates'}) return (output_objects, returnvalues.CLIENT_ERROR) if not job_ids or action in interactive_actions: job_id = '' if job_ids: job_id = job_ids[-1] output_objects.append({'object_type': 'text', 'text' : ''' Fill in the live I/O details below to request communication with a running job. Job ID can be a full ID or a wild card pattern using "*" and "?" to match one or more of your job IDs. Use send output without source and destination paths to request upload of the default stdio files from the job on the resource to the associated job_output directory in your MiG home. Destination is a always handled as a directory path to put source files into. Source and destination paths are always taken relative to the job execution directory on the resource and your MiG home respectively. 
'''}) html = ''' <table class="liveio"> <tr> <td> <form method="post" action="liveio.py"> <table class="liveio"> <tr><td class=centertext> </td></tr> <tr><td> Action:<br /> <input type=radio name=action checked value="send" />send output <input type=radio name=action value="get" />get input </td></tr> <tr><td> Job ID:<br /> <input type=text size=60 name=job_id value="%s" /> </td></tr> <tr><td> Source path(s):<br /> <div id="srcfields"> <input type=text size=60 name=src value="" /><br /> </div> </td></tr> <tr><td> Destination path:<br /> <input type=text size=60 name=dst value="" /> </td></tr> <tr><td> <input type="submit" value="Send request" /> </td></tr> </table> </form> </td> <td> <script type="text/javascript"> fields = 1; max_fields = 64; function addInput() { if (fields < max_fields) { document.getElementById("srcfields").innerHTML += "<input type=text size=60 name=src value='' /><br />"; fields += 1; } else { alert("Maximum " + max_fields + " source fields allowed!"); document.form.add.disabled=true; } } </script> <form name="addsrcform"> <input type="button" onclick="addInput(); return false;" name="add" value="Add another source field" /> </form> </td> </tr> </table> ''' % job_id output_objects.append({'object_type': 'html_form', 'text' : html}) output_objects.append({'object_type': 'text', 'text': ''' Further live job control is avalable through your personal message queues. They provide a basic interface for centrally storing messages under your grid account and can be used to pass messages between jobs or for orchestrating jobs before and during execution. 
''' }) output_objects.append({'object_type': 'link', 'destination': 'mqueue.py', 'text': 'Message queue interface'}) return (output_objects, returnvalues.OK) elif action in ['get', 'receive', 'input']: action = 'get' action_desc = 'will be downloaded to the job on the resource' elif action in ['put', 'send', 'output']: action = 'send' action_desc = 'will be uploaded from the job on the resource' else: output_objects.append({'object_type': 'error_text', 'text' : 'Invalid live io action: %s' % action}) return (output_objects, returnvalues.CLIENT_ERROR) output_objects.append({'object_type': 'text', 'text' : 'Requesting live I/O for %s' % ', '.join(job_ids)}) if action == 'get' and (not src or not dst): output_objects.append( {'object_type': 'error_text', 'text': 'src and dst parameters required for live input'}) return (output_objects, returnvalues.CLIENT_ERROR) # Automatic fall back to stdio files if output with no path provided if src: src_text = 'The files ' + ' '.join(src) else: src_text = 'The job stdio files' if dst: dst_text = 'the ' + dst + ' directory' else: dst_text = 'the corresponding job_output directory' # Please note that base_dir must end in slash to avoid access to other # user dirs when own name is a prefix of another user name base_dir = \ os.path.abspath(os.path.join(configuration.mrsl_files_dir, client_dir)) + os.sep filelist = [] for job_id in job_ids: job_id = job_id.strip() # is job currently being executed? 
# Backward compatibility - all_jobs keyword should match all jobs if job_id == all_jobs: job_id = '*' # Check directory traversal attempts before actual handling to avoid # leaking information about file system layout while allowing # consistent error messages unfiltered_match = glob.glob(base_dir + job_id + '.mRSL') match = [] for server_path in unfiltered_match: real_path = os.path.abspath(server_path) if not valid_user_path(real_path, base_dir, True): # out of bounds - save user warning for later to allow # partial match: # ../*/* is technically allowed to match own files. logger.warning("%s tried to %s restricted path %s ! (%s)" % \ (client_id, op_name, real_path, job_id)) continue # Insert valid job files in filelist for later treatment match.append(real_path) # Now actually treat list of allowed matchings and notify if no # (allowed) match.... if not match: output_objects.append( {'object_type': 'error_text', 'text' : '%s: You do not have any matching job IDs!' % job_id}) else: filelist += match for filepath in filelist: # Extract jo_id from filepath (replace doesn't modify filepath) mrsl_file = filepath.replace(base_dir, '') job_id = mrsl_file.replace('.mRSL', '') job_dict = unpickle(filepath, logger) if not job_dict: status = returnvalues.CLIENT_ERROR output_objects.append( {'object_type': 'error_text', 'text' : ('You can only list status of your own jobs. ' 'Please verify that you submitted the mRSL file ' 'with job id "%s" (Could not unpickle mRSL file %s)' ) % (job_id, filepath)}) continue if job_dict['STATUS'] != 'EXECUTING': output_objects.append( {'object_type': 'text', 'text' : 'Job %s is not currently being executed! Job status: %s' % (job_id, job_dict['STATUS'])}) continue if job_dict['UNIQUE_RESOURCE_NAME'] == 'ARC': output_objects.append( {'object_type': 'text', 'text' : 'Job %s is submitted to ARC, details are not available!' 
% job_id }) continue last_live_update_dict = {} last_live_update_file = configuration.mig_system_files + os.sep\ + job_id + '.last_live_update' if os.path.isfile(last_live_update_file): last_live_update_dict_unpickled = \ unpickle(last_live_update_file, logger) if not last_live_update_dict_unpickled: output_objects.append({'object_type': 'error_text', 'text' : 'Could not unpickle %s - skipping request!' % last_live_update_file}) continue if not last_live_update_dict_unpickled.has_key( 'LAST_LIVE_UPDATE_REQUEST_TIMESTAMP'): output_objects.append( {'object_type': 'error_text', 'text': 'Could not find needed key in %s.' % last_live_update_file}) continue last_live_update_request = \ last_live_update_dict_unpickled['LAST_LIVE_UPDATE_REQUEST_TIMESTAMP' ] difference = datetime.datetime.now()- last_live_update_request try: min_delay = \ int(configuration.min_seconds_between_live_update_requests) except: min_delay = 30 if difference.seconds < min_delay: output_objects.append( {'object_type': 'error_text', 'text': ('Request not allowed, you must wait at least ' \ '%s seconds between live update requests!' ) % min_delay}) continue # save this request to file to avoid DoS from a client request loop. 
last_live_update_dict['LAST_LIVE_UPDATE_REQUEST_TIMESTAMP'] = \ datetime.datetime.now() pickle_ret = pickle(last_live_update_dict, last_live_update_file, logger) if not pickle_ret: output_objects.append( {'object_type': 'error_text', 'text' : 'Error saving live io request timestamp to last_live_update ' 'file, request not sent!'}) continue # # # ## job is being executed right now, send live io request to frontend # # # get resource_config, needed by scp_file_to_resource #(status, resource_config) = get_resource_configuration( # resource_home, unique_resource_name, logger) resource_config = job_dict['RESOURCE_CONFIG'] (status, exe) = get_resource_exe(resource_config, job_dict['EXE'], logger) if not status: output_objects.append( {'object_type': 'error_text', 'text' : 'Could not get exe configuration for job %s' % job_id}) continue local_file = '%s.%supdate' % (job_dict['LOCALJOBNAME'], action) if not os.path.exists(local_file): # create try: filehandle = open(local_file, 'w') filehandle.write('job_id ' + job_dict['JOB_ID'] + '\n') filehandle.write('localjobname ' + job_dict['LOCALJOBNAME'] + '\n') filehandle.write('execution_user ' + exe['execution_user'] + '\n') filehandle.write('execution_node ' + exe['execution_node'] + '\n') filehandle.write('execution_dir ' + exe['execution_dir'] + '\n') filehandle.write('target liveio\n') # Leave defaults src and dst to FE script if not provided if src: filehandle.write('source ' + ' '.join(src) + '\n') if dst: filehandle.write('destination ' + dst + '\n') # Backward compatible test for shared_fs - fall back to scp if exe.has_key('shared_fs') and exe['shared_fs']: filehandle.write('copy_command cp\n') filehandle.write('copy_frontend_prefix \n') filehandle.write('copy_execution_prefix \n') else: filehandle.write('copy_command scp -B\n') filehandle.write('copy_frontend_prefix ${frontend_user}@${frontend_node}:\n' ) filehandle.write('copy_execution_prefix ${execution_user}@${execution_node}:\n' ) filehandle.write('### END OF 
SCRIPT ###\n') filehandle.close() except Exception, exc: pass if not os.path.exists(local_file): output_objects.append( {'object_type': 'error_text', 'text' : '.%supdate file not available on %s server' % \ (action, configuration.short_title)}) continue scpstatus = copy_file_to_resource(local_file, '%s.%supdate' % (job_dict['LOCALJOBNAME'], action), resource_config, logger) if not scpstatus: output_objects.append( {'object_type': 'error_text', 'text' : 'Error sending request for live io to resource!'}) continue else: output_objects.append( {'object_type': 'text', 'text' : 'Request for live io was successfully sent to the resource!' }) output_objects.append( {'object_type': 'text', 'text' : '%s %s and should become available in %s in a minute.' % \ (src_text, action_desc, dst_text) }) if action == 'send': if not dst: target_path = '%s/%s/*' % (job_output_dir, job_id) else: target_path = dst output_objects.append({'object_type': 'link', 'destination' : 'ls.py?path=%s' % target_path, 'text': 'View uploaded files'}) else: output_objects.append({'object_type': 'link', 'destination' : 'ls.py?path=%s' % ';path='.join(src), 'text': 'View files for download'}) try: os.remove(local_file) except Exception, exc: pass
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Resubmits all jobs matching the supplied job_id patterns: each saved
    mRSL job file is unpickled, rebuilt into an mRSL string and submitted
    as a brand new job. Returns the usual (output_objects, returnvalue)
    tuple.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
        )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)

    if not correct_handler('POST'):
        output_objects.append(
            {'object_type': 'error_text', 'text'
             : 'Only accepting POST requests to prevent unintended updates'})
        return (output_objects, returnvalues.CLIENT_ERROR)

    patterns = accepted['job_id']

    if not patterns:
        output_objects.append({'object_type': 'error_text', 'text'
                              : 'No job_id specified!'})
        return (output_objects, returnvalues.NO_SUCH_JOB_ID)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name

    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                        client_dir)) + os.sep

    # NOTE: status must be initialized *before* the pattern loop. It used to
    # be set to OK after the loop, which silently discarded the CLIENT_ERROR
    # recorded for patterns without any (allowed) match.
    status = returnvalues.OK
    filelist = []
    keywords_dict = mrslkeywords.get_keywords_dict(configuration)
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs

        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages

        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            real_path = os.path.abspath(server_path)
            if not valid_user_path(real_path, base_dir, True):

                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.

                logger.warning('%s tried to %s restricted path %s ! (%s)'
                               % (client_id, op_name, real_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment

            match.append(real_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match

        if not match:
            output_objects.append(
                {'object_type': 'error_text', 'text'
                 : '%s: You do not have any matching job IDs!' % pattern})
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # resubmit is hard on the server

    if len(filelist) > 100:
        output_objects.append({'object_type': 'error_text', 'text'
                              : 'Too many matching jobs (%s)!'
                              % len(filelist)})
        return (output_objects, returnvalues.CLIENT_ERROR)

    resubmitobjs = []
    for filepath in filelist:
        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        # ("Resubmitting job with job_id: %s" % job_id)

        resubmitobj = {'object_type': 'resubmitobj', 'job_id': job_id}

        mrsl_dict = unpickle(filepath, logger)
        if not mrsl_dict:
            resubmitobj['message'] = "No such job: %s (%s)" % (job_id,
                                                               mrsl_file)
            status = returnvalues.CLIENT_ERROR
            resubmitobjs.append(resubmitobj)
            continue

        resubmit_items = keywords_dict.keys()

        # loop selected keywords and create mRSL string

        resubmit_job_string = ''

        for dict_elem in resubmit_items:
            value = ''

            # Extract job value with fallback to default to support optional
            # fields

            job_value = mrsl_dict.get(dict_elem,
                                      keywords_dict[dict_elem]['Value'])
            if keywords_dict[dict_elem]['Type'].startswith(
                    'multiplekeyvalues'):
                for (elem_key, elem_val) in job_value:
                    if elem_key:
                        value += '%s=%s\n' % (str(elem_key).strip(),
                                              str(elem_val).strip())
            elif keywords_dict[dict_elem]['Type'].startswith('multiple'):
                for elem in job_value:
                    if elem:
                        value += '%s\n' % str(elem).rstrip()
            else:
                if str(job_value):
                    value += '%s\n' % str(job_value).rstrip()

            # Only insert keywords with an associated value

            if value:
                if value.rstrip() != '':
                    resubmit_job_string += '''::%s::
%s

''' % (dict_elem, value.rstrip())

        # save tempfile

        (filehandle, tempfilename) = \
            tempfile.mkstemp(dir=configuration.mig_system_files,
                             text=True)
        os.write(filehandle, resubmit_job_string)
        os.close(filehandle)

        # submit job the usual way

        (new_job_status, msg, new_job_id) = new_job(tempfilename,
                                                    client_id,
                                                    configuration, False,
                                                    True)
        if not new_job_status:
            resubmitobj['status'] = False
            resubmitobj['message'] = msg
            status = returnvalues.SYSTEM_ERROR
            resubmitobjs.append(resubmitobj)
            continue

            # o.out("Resubmit failed: %s" % msg)
            # o.reply_and_exit(o.ERROR)

        resubmitobj['status'] = True
        resubmitobj['new_job_id'] = new_job_id
        resubmitobjs.append(resubmitobj)

        # o.out("Resubmit successful: %s" % msg)
        # o.out("%s" % msg)

    output_objects.append({'object_type': 'resubmitobjs', 'resubmitobjs'
                          : resubmitobjs})

    return (output_objects, status)
# Traverse mRSL dir and update cache for (root, _, files) in os.walk(root_dir, topdown=True): # skip all dot dirs - they are from repos etc and _not_ jobs if root.find(os.sep + '.') != -1: continue for name in files: filename = os.path.join(root, name) # Only files modified since last update is checked if os.path.getmtime(filename) > last_buildtime: job_dict = unpickle(filename, self.__logger) if not job_dict: msg = 'gridstat::update() could not load: %s '\ % filename self.__logger.error(msg) continue job_vgrids = validated_vgrid_list(self.__configuration, job_dict) for job_vgrid_name in job_vgrids: # Update the statistics and cache # from the job details job_vgrid_name = job_vgrid_name.upper()
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Requests schedule information for the jobs matching the supplied job_id
    patterns by sending a JOBSCHEDULE message to the grid script for each
    matching job still in a pending state (QUEUED, RETRY or FROZEN).
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
        )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)

    if not correct_handler('POST'):
        output_objects.append(
            {'object_type': 'error_text', 'text'
             : 'Only accepting POST requests to prevent unintended updates'})
        return (output_objects, returnvalues.CLIENT_ERROR)

    patterns = accepted['job_id']

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name

    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                        client_dir)) + os.sep

    status = returnvalues.OK
    filelist = []
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs

        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages

        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            real_path = os.path.abspath(server_path)
            if not valid_user_path(real_path, base_dir, True):

                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.

                logger.warning('%s tried to %s restricted path %s ! (%s)'
                               % (client_id, op_name, real_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment

            match.append(real_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match

        if not match:
            output_objects.append(
                {'object_type': 'error_text', 'text'
                 : '%s: You do not have any matching job IDs!' % pattern})
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # job schedule is hard on the server, limit

    if len(filelist) > 100:
        output_objects.append({'object_type': 'error_text', 'text'
                              : 'Too many matching jobs (%s)!'
                              % len(filelist)})
        return (output_objects, returnvalues.CLIENT_ERROR)

    saveschedulejobs = []

    for filepath in filelist:

        # Extract job_id from filepath (replace doesn't modify filepath)

        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        saveschedulejob = {'object_type': 'saveschedulejob',
                           'job_id': job_id}

        # NOTE: renamed local from 'dict' to job_dict to stop shadowing the
        # builtin dict type
        job_dict = unpickle(filepath, logger)
        if not job_dict:
            saveschedulejob['message'] = \
                ('The file containing the information' \
                 ' for job id %s could not be opened!' \
                 ' You can only read schedule for ' \
                 'your own jobs!') % job_id
            saveschedulejobs.append(saveschedulejob)
            status = returnvalues.CLIENT_ERROR
            continue

        saveschedulejob['oldstatus'] = job_dict['STATUS']

        # Is the job status pending?

        possible_schedule_states = ['QUEUED', 'RETRY', 'FROZEN']
        if not job_dict['STATUS'] in possible_schedule_states:
            saveschedulejob['message'] = \
                'You can only read schedule for jobs with status: %s.'\
                % ' or '.join(possible_schedule_states)
            saveschedulejobs.append(saveschedulejob)
            continue

        # notify queue

        if not send_message_to_grid_script('JOBSCHEDULE ' + job_id
                                           + '\n', logger, configuration):
            output_objects.append(
                {'object_type': 'error_text', 'text'
                 : 'Error sending message to grid_script, update may fail.'
                 })
            status = returnvalues.SYSTEM_ERROR
            continue

        saveschedulejobs.append(saveschedulejob)

    savescheduleinfo = """Please find any available job schedule status in
verbose job status output."""
    output_objects.append({'object_type': 'saveschedulejobs',
                          'saveschedulejobs': saveschedulejobs,
                          'savescheduleinfo': savescheduleinfo})
    return (output_objects, status)
def main(client_id, user_arguments_dict): """Main function used by front end""" (configuration, logger, output_objects, op_name) = \ initialize_main_variables(client_id, op_header=False) client_dir = client_id_dir(client_id) status = returnvalues.OK defaults = signature()[1] (validate_status, accepted) = validate_input_and_cert( user_arguments_dict, defaults, output_objects, client_id, configuration, allow_rejects=False, ) if not validate_status: logger.error("jobstatus input validation failed: %s" % accepted) return (accepted, returnvalues.CLIENT_ERROR) flags = ''.join(accepted['flags']) max_jobs = int(accepted['max_jobs'][-1]) order = 'unsorted ' if sorted(flags): order = 'sorted ' patterns = accepted['job_id'] project_names = accepted['project_name'] if len(project_names) > 0: for project_name in project_names: project_name_job_ids = \ get_job_ids_with_specified_project_name(client_id, project_name, configuration.mrsl_files_dir, logger) patterns.extend(project_name_job_ids) # Please note that base_dir must end in slash to avoid access to other # user dirs when own name is a prefix of another user name base_dir = \ os.path.abspath(os.path.join(configuration.mrsl_files_dir, client_dir)) + os.sep output_objects.append({'object_type': 'header', 'text' : '%s %s job status' % \ (configuration.short_title, order)}) if not patterns: output_objects.append({'object_type': 'error_text', 'text' : 'No job_id specified!'}) return (output_objects, returnvalues.NO_SUCH_JOB_ID) if verbose(flags): for flag in flags: output_objects.append({'object_type': 'text', 'text' : '%s using flag: %s' % (op_name, flag)}) if not os.path.isdir(base_dir): output_objects.append( {'object_type': 'error_text', 'text' : ('You have not been created as a user on the %s server! 
' \ 'Please contact the %s team.') % \ (configuration.short_title, configuration.short_title)}) return (output_objects, returnvalues.CLIENT_ERROR) filelist = [] for pattern in patterns: pattern = pattern.strip() # Backward compatibility - all_jobs keyword should match all jobs if pattern == all_jobs: pattern = '*' # Check directory traversal attempts before actual handling to # avoid leaking information about file system layout while # allowing consistent error messages unfiltered_match = glob.glob(base_dir + pattern + '.mRSL') match = [] for server_path in unfiltered_match: real_path = os.path.abspath(server_path) if not valid_user_path(real_path, base_dir, True): # out of bounds - save user warning for later to allow # partial match: # ../*/* is technically allowed to match own files. logger.warning('%s tried to %s restricted path %s ! (%s)' % (client_id, op_name, real_path, pattern)) continue # Insert valid job files in filelist for later treatment match.append(real_path) # Now actually treat list of allowed matchings and notify if no # (allowed) match.... if not match: output_objects.append( {'object_type': 'error_text', 'text' : '%s: You do not have any matching job IDs!' 
% pattern}) status = returnvalues.CLIENT_ERROR else: filelist += match if sorted(flags): sort(filelist) if max_jobs < len(filelist): output_objects.append( {'object_type': 'text', 'text' : 'Only showing first %d of the %d matching jobs as requested' % (max_jobs, len(filelist))}) filelist = filelist[:max_jobs] # Iterate through jobs and print details for each job_list = {'object_type': 'job_list', 'jobs': []} for filepath in filelist: # Extract job_id from filepath (replace doesn't modify filepath) mrsl_file = filepath.replace(base_dir, '') job_id = mrsl_file.replace('.mRSL', '') job_dict = unpickle(filepath, logger) if not job_dict: status = returnvalues.CLIENT_ERROR output_objects.append( {'object_type': 'error_text', 'text' : 'No such job: %s (could not load mRSL file %s)' % \ (job_id, filepath)}) continue # Expand any job variables before use job_dict = expand_variables(job_dict) job_obj = {'object_type': 'job', 'job_id': job_id} job_obj['status'] = job_dict['STATUS'] time_fields = [ 'VERIFIED', 'VERIFIED_TIMESTAMP', 'RECEIVED_TIMESTAMP', 'QUEUED_TIMESTAMP', 'SCHEDULE_TIMESTAMP', 'EXECUTING_TIMESTAMP', 'FINISHED_TIMESTAMP', 'FAILED_TIMESTAMP', 'CANCELED_TIMESTAMP', ] for name in time_fields: if job_dict.has_key(name): # time objects cannot be marshalled, asctime if timestamp try: job_obj[name.lower()] = time.asctime(job_dict[name]) except Exception, exc: # not a time object, just add job_obj[name.lower()] = job_dict[name] ########################################### # ARC job status retrieval on demand: # But we should _not_ update the status in the mRSL files, since # other MiG code might rely on finding only valid "MiG" states. 
if configuration.arc_clusters and \ job_dict.get('UNIQUE_RESOURCE_NAME', 'unset') == 'ARC' \ and job_dict['STATUS'] == 'EXECUTING': try: home = os.path.join(configuration.user_home, client_dir) arcsession = arc.Ui(home) arcstatus = arcsession.jobStatus(job_dict['EXE']) job_obj['status'] = arcstatus['status'] except arc.ARCWrapperError, err: logger.error('Error retrieving ARC job status: %s' % \ err.what()) job_obj['status'] += '(Error: ' + err.what() + ')' except arc.NoProxyError, err: logger.error('While retrieving ARC job status: %s' % \ err.what()) job_obj['status'] += '(Error: ' + err.what() + ')'
if __name__ == '__main__': print 'starting translation test. Args: ' , len(sys.argv) logger.debug('translation for file ' + sys.argv[1] + ' starts') if len(sys.argv) > 1: fname = sys.argv[1] parsed = '.'.join([fname,'parsed']) translated = '.'.join([parsed,'xrsl']) try: import shared.mrslparser as p import shared.fileio as fileio (presult,errors) = p.parse(fname, 'test-id', '+No+Client+Id',None,parsed) if not presult: print 'Errors:\n%s' % errors else: print 'Parsing OK, now translating' mrsl_dict = fileio.unpickle(parsed,logger) (xrsl,script,name) = translate(mrsl_dict,'test-name') print '\n'.join(['Job name',name,'script',script,'XRSL']) fileio.write_file(script, "test-id.sh", logger) print (format_xrsl(xrsl)) fileio.write_file("%s" % xrsl, translated, logger) print 'done' except Exception, err: print 'Error.' print err.__str__()
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    Renders the vgrid workflows page: trigger management table, currently
    active trigger jobs and the recent trigger log. 'operation' selects
    between full page rendering (show_operations) and bare data listing
    (list_operations) for ajax refresh.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id, op_header=False)
    defaults = signature()[1]
    title_entry = find_entry(output_objects, 'title')
    label = "%s" % configuration.site_vgrid_label
    title_entry['text'] = '%s Workflows' % label
    # NOTE: Delay header entry here to include vgrid_name
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)
    vgrid_name = accepted['vgrid_name'][-1]
    operation = accepted['operation'][-1]
    flags = ''.join(accepted['flags'][-1])

    # Access control: only vgrid owners and members may see workflows
    if not vgrid_is_owner_or_member(vgrid_name, client_id, configuration):
        output_objects.append({
            'object_type': 'error_text',
            'text': '''You must be an owner or member of %s vgrid to
access the workflows.''' % vgrid_name
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    if not operation in allowed_operations:
        output_objects.append({
            'object_type': 'error_text',
            'text': '''Operation must be one of %s.''' %
            ', '.join(allowed_operations)
        })
        return (output_objects, returnvalues.OK)

    if operation in show_operations:

        # jquery support for tablesorter (and unused confirmation dialog)
        # table initially sorted by 0 (last update / date)

        refresh_call = 'ajax_workflowjobs("%s", "%s")' % (vgrid_name, flags)
        table_spec = {
            'table_id': 'workflowstable',
            'sort_order': '[[0,1]]',
            'refresh_call': refresh_call
        }
        (add_import, add_init, add_ready) = man_base_js(configuration,
                                                        [table_spec])
        # Only trigger the initial ajax load on full page show
        if operation == "show":
            add_ready += '%s;' % refresh_call
        add_ready += '''
        /* Init variables helper as foldable but closed and with individual
        heights */
        $(".variables-accordion").accordion({
                                   collapsible: true,
                                   active: false,
                                   heightStyle: "content"
                                  });
        /* fix and reduce accordion spacing */
        $(".ui-accordion-header").css("padding-top", 0)
                                 .css("padding-bottom", 0).css("margin", 0);
        /* NOTE: requires managers CSS fix for proper tab bar height */
        $(".workflow-tabs").tabs();
        $("#logarea").scrollTop($("#logarea")[0].scrollHeight);
'''
        title_entry['script']['advanced'] += add_import
        title_entry['script']['init'] += add_init
        title_entry['script']['ready'] += add_ready
        output_objects.append({
            'object_type': 'html_form',
            'text': man_base_html(configuration)
        })

        output_objects.append({
            'object_type': 'header',
            'text': '%s Workflows for %s' % (label, vgrid_name)
        })

    logger.info('vgridworkflows %s %s' % (vgrid_name, operation))

    # Iterate through jobs and list details for each

    trigger_jobs = []
    log_content = ''

    if operation in list_operations:
        # Pending trigger job files live under the vgrid trigger job dir and
        # are moved to final_states once the corresponding server job is done
        trigger_job_dir = os.path.join(
            configuration.vgrid_home,
            os.path.join(vgrid_name,
                         '.%s.jobs' % configuration.vgrid_triggers))
        trigger_job_pending_dir = os.path.join(trigger_job_dir,
                                               'pending_states')
        trigger_job_final_dir = os.path.join(trigger_job_dir,
                                             'final_states')
        if makedirs_rec(trigger_job_pending_dir, configuration) \
                and makedirs_rec(trigger_job_final_dir, configuration):
            abs_vgrid_dir = '%s/' \
                % os.path.abspath(os.path.join(configuration.vgrid_files_home,
                                               vgrid_name))
            for filename in os.listdir(trigger_job_pending_dir):
                trigger_job_filepath = \
                    os.path.join(trigger_job_pending_dir, filename)
                trigger_job = unpickle(trigger_job_filepath, logger)
                serverjob_filepath = \
                    os.path.join(configuration.mrsl_files_dir,
                                 os.path.join(
                                     client_id_dir(trigger_job['owner']),
                                     '%s.mRSL' % trigger_job['jobid']))
                serverjob = unpickle(serverjob_filepath, logger)
                if serverjob:
                    if serverjob['STATUS'] in pending_states:
                        trigger_event = trigger_job['event']
                        trigger_rule = trigger_job['rule']
                        trigger_action = trigger_event['event_type']
                        trigger_time = time.ctime(trigger_event['time_stamp'])
                        # Strip the absolute vgrid prefix from event paths
                        trigger_path = '%s %s' % \
                            (trigger_event['src_path'].replace(
                                abs_vgrid_dir, ''),
                             trigger_event['dest_path'].replace(
                                 abs_vgrid_dir, ''))
                        job = {
                            'object_type': 'trigger_job',
                            'job_id': trigger_job['jobid'],
                            'rule_id': trigger_rule['rule_id'],
                            'path': trigger_path,
                            'action': trigger_action,
                            'time': trigger_time,
                            'status': serverjob['STATUS']
                        }
                        # Hide system image trigger jobs unless verbose
                        if not job['rule_id'].startswith(img_trigger_prefix) \
                                or verbose(flags):
                            trigger_jobs.append(job)
                    elif serverjob['STATUS'] in final_states:
                        # Job finished - archive the trigger job file
                        src_path = os.path.join(trigger_job_pending_dir,
                                                filename)
                        dest_path = os.path.join(trigger_job_final_dir,
                                                 filename)
                        move_file(src_path, dest_path, configuration)
                    else:
                        logger.error(
                            'Trigger job: %s, unknown state: %s' %
                            (trigger_job['jobid'], serverjob['STATUS']))

        log_content = read_trigger_log(configuration, vgrid_name, flags)

    if operation in show_operations:

        # Always run as rule creator to avoid users being able to act on behalf
        # of ANY other user using triggers (=exploit)

        extra_fields = [
            ('path', None),
            ('match_dirs', ['False', 'True']),
            ('match_recursive', ['False', 'True']),
            ('changes', [keyword_all] + valid_trigger_changes),
            ('action', [keyword_auto] + valid_trigger_actions),
            ('arguments', None),
            ('run_as', client_id),
        ]

        # NOTE: we do NOT show saved template contents - see addvgridtriggers

        optional_fields = [('rate_limit', None), ('settle_time', None)]

        # Only include system triggers in verbose mode
        if verbose(flags):
            system_filter = []
        else:
            system_filter = [('rule_id', '%s_.*' % img_trigger_prefix)]
        (init_status, oobjs) = vgrid_add_remove_table(
            client_id, vgrid_name, 'trigger', 'vgridtrigger', configuration,
            extra_fields + optional_fields, filter_items=system_filter)
        if not init_status:
            output_objects.append({
                'object_type': 'error_text',
                'text': 'failed to load triggers: %s' % oobjs
            })
            return (output_objects, returnvalues.SYSTEM_ERROR)

        # Generate variable helper values for a few concrete samples for help
        # text

        vars_html = ''
        dummy_rule = {'run_as': client_id, 'vgrid_name': vgrid_name}
        samples = [('input.txt', 'modified'), ('input/image42.raw',
                                               'changed')]
        for (path, change) in samples:
            vgrid_path = os.path.join(vgrid_name, path)
            vars_html += "<b>Expanded variables when %s is %s:</b><br/>" % \
                (vgrid_path, change)
            expanded = get_path_expand_map(vgrid_path, dummy_rule, change)
            for (key, val) in expanded.items():
                vars_html += "    %s: %s<br/>" % (key, val)
        commands_html = ''
        commands = get_usage_map(configuration)
        for usage in commands.values():
            commands_html += "    %s<br/>" % usage

        helper_html = """
<div class='variables-accordion'>
<h4>Help on available trigger variable names and values</h4>
<p>
Triggers can use a number of helper variables on the form +TRIGGERXYZ+ to
dynamically act on targets. Some of the values are bound to the rule owner the
%s while the remaining ones are automatically expanded for the particular
trigger target as shown in the following examples:<br/>
%s
</p>
<h4>Help on available trigger commands and arguments</h4>
<p>
It is possible to set up trigger rules that basically run any operation with a
side effect you could manually do on %s. I.e. like submitting/cancelling
a job, creating/moving/deleting a file or directory and so on. When you select
'command' as the action for a trigger rule, you have the following commands at
your disposal:<br/>
%s
</p>
</div>
""" % (label, vars_html, configuration.short_title, commands_html)

        # Make page with manage triggers tab and active jobs and log tab

        output_objects.append({
            'object_type': 'html_form',
            'text': '''
    <div id="wrap-tabs" class="workflow-tabs">
<ul>
<li><a href="#manage-tab">Manage Triggers</a></li>
<li><a href="#jobs-tab">Active Trigger Jobs</a></li>
</ul>
'''
        })

        # Display existing triggers and form to add new ones

        output_objects.append({
            'object_type': 'html_form',
            'text': '''
<div id="manage-tab">
'''
        })
        output_objects.append({
            'object_type': 'sectionheader',
            'text': 'Manage Triggers'
        })
        output_objects.extend(oobjs)
        output_objects.append({
            'object_type': 'html_form',
            'text': helper_html
        })

        if configuration.site_enable_crontab:
            output_objects.append({
                'object_type': 'html_form',
                'text': '''
<p>You can combine these workflows with the personal '''
            })
            output_objects.append({
                'object_type': 'link',
                'destination': 'crontab.py',
                'class': 'crontablink iconspace',
                'text': 'schedule task'
            })
            output_objects.append({
                'object_type': 'html_form',
                'text': '''
facilities in case you want to trigger flows at given times rather than only
in reaction to file system events.</p>
'''
            })
        output_objects.append({
            'object_type': 'html_form',
            'text': '''
</div>
'''
        })

        # Display active trigger jobs and recent logs for this vgrid

        output_objects.append({
            'object_type': 'html_form',
            'text': '''
    <div id="jobs-tab">
'''
        })

        output_objects.append({
            'object_type': 'sectionheader',
            'text': 'Active Trigger Jobs'
        })
        output_objects.append({
            'object_type': 'table_pager',
            'entry_name': 'job',
            'default_entries': default_pager_entries
        })
    # NOTE: trigger_job_list and trigger_log are emitted for both show and
    # list operations - only the surrounding page chrome is show-only
    output_objects.append({
        'object_type': 'trigger_job_list',
        'trigger_jobs': trigger_jobs
    })
    if operation in show_operations:
        output_objects.append({
            'object_type': 'sectionheader',
            'text': 'Trigger Log'
        })
    output_objects.append({
        'object_type': 'trigger_log',
        'log_content': log_content
    })
    if operation in show_operations:
        output_objects.append({
            'object_type': 'html_form',
            'text': '''
</div>
'''
        })

        output_objects.append({
            'object_type': 'html_form',
            'text': '''
</div>
'''
        })
    return (output_objects, returnvalues.OK)
def main(client_id, user_arguments_dict):
    """Main function used by front end.

    CSRF-protected variant of the job schedule request handler: for each job
    matching the job_id patterns and still in a pending state the grid
    script is notified with a JOBSCHEDULE message.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)

    patterns = accepted['job_id']

    if not safe_handler(configuration, 'post', op_name, client_id,
                        get_csrf_limit(configuration), accepted):
        output_objects.append({
            'object_type': 'error_text',
            'text': '''Only accepting
CSRF-filtered POST requests to prevent unintended updates'''
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    if not configuration.site_enable_jobs:
        output_objects.append({
            'object_type': 'error_text',
            'text': '''Job execution is not enabled on this system'''
        })
        return (output_objects, returnvalues.SYSTEM_ERROR)

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name

    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    status = returnvalues.OK
    filelist = []
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs

        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages

        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):

                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.

                logger.warning('%s tried to %s restricted path %s ! (%s)' %
                               (client_id, op_name, abs_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment

            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match

        if not match:
            output_objects.append({
                'object_type': 'error_text',
                'text': '%s: You do not have any matching job IDs!' % pattern
            })
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # job schedule is hard on the server, limit

    if len(filelist) > 100:
        output_objects.append({
            'object_type': 'error_text',
            'text': 'Too many matching jobs (%s)!' % len(filelist)
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    saveschedulejobs = []

    for filepath in filelist:

        # Extract job_id from filepath (replace doesn't modify filepath)

        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        saveschedulejob = {'object_type': 'saveschedulejob',
                           'job_id': job_id}

        # NOTE: renamed local from 'dict' to job_dict to stop shadowing the
        # builtin dict type
        job_dict = unpickle(filepath, logger)
        if not job_dict:
            saveschedulejob['message'] = \
                ('The file containing the information' \
                 ' for job id %s could not be opened!' \
                 ' You can only read schedule for ' \
                 'your own jobs!') % job_id
            saveschedulejobs.append(saveschedulejob)
            status = returnvalues.CLIENT_ERROR
            continue

        saveschedulejob['oldstatus'] = job_dict['STATUS']

        # Is the job status pending?

        possible_schedule_states = ['QUEUED', 'RETRY', 'FROZEN']
        if not job_dict['STATUS'] in possible_schedule_states:
            saveschedulejob['message'] = \
                'You can only read schedule for jobs with status: %s.'\
                % ' or '.join(possible_schedule_states)
            saveschedulejobs.append(saveschedulejob)
            continue

        # notify queue

        if not send_message_to_grid_script('JOBSCHEDULE ' + job_id
                                           + '\n', logger, configuration):
            output_objects.append({
                'object_type': 'error_text',
                'text': 'Error sending message to grid_script, update may fail.'
            })
            status = returnvalues.SYSTEM_ERROR
            continue

        saveschedulejobs.append(saveschedulejob)

    savescheduleinfo = """Please find any available job schedule status in
verbose job status output."""
    output_objects.append({
        'object_type': 'saveschedulejobs',
        'saveschedulejobs': saveschedulejobs,
        'savescheduleinfo': savescheduleinfo
    })
    return (output_objects, status)
o.reply_and_exit(o.ERROR) # Check that resource address matches request source # TODO: get real ip and enable this check # remote_ip = str(os.getenv("REMOTE_ADDR")) # resource_ip = "0.0.0.0" # if remote_ip != resource_ip: # print "Warning: job request not sent from expected resource address!" # logger.warning("job request not issued from address of resource! (%s != %s)", remote_ip, resource_ip) # TODO: check that the person who submitted the job (where the session ID points) is also the one that submitted the # received jobid (to avoid a verified user specifies another users job id) mrslfile = configuration.sessid_to_mrsl_link_home + sessionid + '.mRSL' mrsldict = unpickle(mrslfile, logger) if not mrsldict: o.out('requestinteractivejob error! Could not open mrsl file') o.reply_and_exit(o.ERROR) job_submitter_client_id = mrsldict['USER_CERT'] o.out('job_submitter_client_id: %s' % job_submitter_client_id) mrsl_jobid = mrsldict['JOB_ID'] if not jobid == mrsl_jobid: o.out('requestinteractivejob error! Wrong job_id specified!') o.reply_and_exit(o.ERROR) # TODO: check the status of the specified job(id) and verify it has not previously been executed. # The status must be ? (What about RETRY?)
def main(client_id, user_arguments_dict):
    """Main function used by front end: change status of the client's jobs.

    Expands the requested job_id pattern(s) inside the caller's own
    mrsl_files_dir (chroot-checked), verifies the requested action
    (one of valid_actions, e.g. cancel/freeze/thaw) is compatible with each
    job's current status, repickles the job file with the new state and
    notifies grid_script so the queue stays consistent.

    Returns the usual (output_objects, returnvalue) tuple for the front end.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id)
    client_dir = client_id_dir(client_id)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)

    patterns = accepted['job_id']
    action = accepted['action'][-1]

    if not safe_handler(configuration, 'post', op_name, client_id,
                        get_csrf_limit(configuration), accepted):
        output_objects.append({
            'object_type': 'error_text',
            'text': '''Only accepting CSRF-filtered POST requests to prevent unintended updates'''
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    if not configuration.site_enable_jobs:
        output_objects.append({
            'object_type': 'error_text',
            'text': '''Job execution is not enabled on this system'''
        })
        return (output_objects, returnvalues.SYSTEM_ERROR)

    # membership test directly on the dict - no need for the keys() list
    if action not in valid_actions:
        output_objects.append({
            'object_type': 'error_text',
            'text': 'Invalid job action "%s" (only %s supported)'
            % (action, ', '.join(valid_actions.keys()))
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    new_state = valid_actions[action]

    # Job states each action may be applied to. Actions without an entry
    # here are not restricted by the current job status.
    required_states = {
        'cancel': ['PARSE', 'QUEUED', 'RETRY', 'EXECUTING', 'FROZEN'],
        'freeze': ['QUEUED', 'RETRY'],
        'thaw': ['FROZEN'],
    }

    # Please note that base_dir must end in slash to avoid access to other
    # user dirs when own name is a prefix of another user name
    base_dir = \
        os.path.abspath(os.path.join(configuration.mrsl_files_dir,
                                     client_dir)) + os.sep

    status = returnvalues.OK
    filelist = []
    for pattern in patterns:
        pattern = pattern.strip()

        # Backward compatibility - all_jobs keyword should match all jobs
        if pattern == all_jobs:
            pattern = '*'

        # Check directory traversal attempts before actual handling to avoid
        # leaking information about file system layout while allowing
        # consistent error messages
        unfiltered_match = glob.glob(base_dir + pattern + '.mRSL')
        match = []
        for server_path in unfiltered_match:
            # IMPORTANT: path must be expanded to abs for proper chrooting
            abs_path = os.path.abspath(server_path)
            if not valid_user_path(configuration, abs_path, base_dir, True):
                # out of bounds - save user warning for later to allow
                # partial match:
                # ../*/* is technically allowed to match own files.
                logger.error(
                    '%s tried to use %s %s outside own home! (pattern %s)'
                    % (client_id, op_name, abs_path, pattern))
                continue

            # Insert valid job files in filelist for later treatment
            match.append(abs_path)

        # Now actually treat list of allowed matchings and notify if no
        # (allowed) match
        if not match:
            output_objects.append({
                'object_type': 'error_text',
                'text': '%s: You do not have any matching job IDs!' % pattern
            })
            status = returnvalues.CLIENT_ERROR
        else:
            filelist += match

    # job state change is hard on the server, limit
    if len(filelist) > 500:
        output_objects.append({
            'object_type': 'error_text',
            'text': 'Too many matching jobs (%s)!' % len(filelist)
        })
        return (output_objects, returnvalues.CLIENT_ERROR)

    changedstatusjobs = []
    for filepath in filelist:

        # Extract job_id from filepath (replace doesn't modify filepath)
        mrsl_file = filepath.replace(base_dir, '')
        job_id = mrsl_file.replace('.mRSL', '')

        changedstatusjob = {
            'object_type': 'changedstatusjob',
            'job_id': job_id
        }

        job_dict = unpickle(filepath, logger)
        if not job_dict:
            changedstatusjob['message'] = '''The file containing the information for job id %s could not be opened! You can only %s your own jobs!''' % (job_id, action)
            changedstatusjobs.append(changedstatusjob)
            status = returnvalues.CLIENT_ERROR
            continue

        changedstatusjob['oldstatus'] = job_dict['STATUS']

        # Is the job status compatible with action? One table-driven check
        # replaces the three copy-pasted per-action checks; the message
        # produced is identical to the originals.
        possible_states = required_states.get(action, None)
        if possible_states is not None and \
                job_dict['STATUS'] not in possible_states:
            changedstatusjob['message'] = \
                'You can only %s jobs with status: %s.' \
                % (action, ' or '.join(possible_states))
            status = returnvalues.CLIENT_ERROR
            changedstatusjobs.append(changedstatusjob)
            continue

        # job action is handled by changing the STATUS field, notifying the
        # job queue and making sure the server never submits jobs with status
        # FROZEN or CANCELED.

        # file is repickled to ensure newest information is used, job_dict
        # might be old if another script has modified the file.
        if not unpickle_and_change_status(filepath, new_state, logger):
            output_objects.append({
                'object_type': 'error_text',
                'text': 'Job status could not be changed to %s!' % new_state
            })
            status = returnvalues.SYSTEM_ERROR

        # Avoid key error and make sure grid_script gets expected number of
        # arguments
        if 'UNIQUE_RESOURCE_NAME' not in job_dict:
            job_dict['UNIQUE_RESOURCE_NAME'] = \
                'UNIQUE_RESOURCE_NAME_NOT_FOUND'
        if 'EXE' not in job_dict:
            job_dict['EXE'] = 'EXE_NAME_NOT_FOUND'

        # notify queue
        if not send_message_to_grid_script(
                'JOBACTION ' + job_id + ' ' + job_dict['STATUS'] + ' '
                + new_state + ' ' + job_dict['UNIQUE_RESOURCE_NAME'] + ' '
                + job_dict['EXE'] + '\n', logger, configuration):
            output_objects.append({
                'object_type': 'error_text',
                'text': '''Error sending message to grid_script, job may still be in the job queue.'''
            })
            status = returnvalues.SYSTEM_ERROR
            continue

        changedstatusjob['newstatus'] = new_state
        changedstatusjobs.append(changedstatusjob)

    output_objects.append({
        'object_type': 'changedstatusjobs',
        'changedstatusjobs': changedstatusjobs
    })
    return (output_objects, status)
def main(client_id, user_arguments_dict):
    """Main function used by front end: vgrid workflow trigger overview.

    Renders three sections for the requested vgrid_name: the pending
    trigger jobs, the trigger log and the usual add/remove table for
    managing trigger rules. Access requires vgrid ownership or membership.
    """

    (configuration, logger, output_objects, op_name) = \
        initialize_main_variables(client_id, op_header=False)
    defaults = signature()[1]
    (validate_status, accepted) = validate_input_and_cert(
        user_arguments_dict,
        defaults,
        output_objects,
        client_id,
        configuration,
        allow_rejects=False,
    )
    if not validate_status:
        return (accepted, returnvalues.CLIENT_ERROR)

    vgrid_name = accepted['vgrid_name'][-1]

    if not vgrid_is_owner_or_member(vgrid_name, client_id, configuration):
        output_objects.append({'object_type': 'error_text',
                               'text': '''You must be an owner or member of %s vgrid to access the workflows.''' % vgrid_name})
        return (output_objects, returnvalues.CLIENT_ERROR)

    title_entry = find_entry(output_objects, 'title')
    title_entry['text'] = '%s Workflows' % configuration.site_vgrid_label
    title_entry['style'] = themed_styles(configuration)
    # Client side helpers: sortable/pagable workflows table plus a trigger
    # log textarea auto-scrolled to the newest entries
    title_entry['javascript'] = '''
<script type="text/javascript" src="/images/js/jquery.js"></script>
<script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script>
<script type="text/javascript" src="/images/js/jquery.tablesorter.pager.js"></script>
<script type="text/javascript" src="/images/js/jquery.tablesorter.widgets.js"></script>
<script type="text/javascript" src="/images/js/jquery-ui.js"></script>
<script type="text/javascript">
$(document).ready(function() {
          $("#logarea").scrollTop($("#logarea")[0].scrollHeight);
          // table initially sorted by 0 (last update / date)
          var sortOrder = [[0,1]];

          // use image path for sorting if there is any inside
          var imgTitle = function(contents) {
              var key = $(contents).find("a").attr("class");
              if (key == null) {
                  key = $(contents).html();
              }
              return key;
          }
          $("#workflowstable").tablesorter({widgets: ["zebra", "saveSort"],
                                        sortList:sortOrder,
                                        textExtraction: imgTitle
                                        })
                               .tablesorterPager({ container: $("#pager"),
                                        size: %s
                                        });
     }
);
</script>
''' % default_pager_entries
    output_objects.append({'object_type': 'html_form', 'text': '''
 <div id="confirm_dialog" title="Confirm" style="background:#fff;">
    <div id="confirm_text"><!-- filled by js --></div>
    <textarea cols="72" rows="10" id="confirm_input" style="display:none;"></textarea>
 </div>
'''})
    output_objects.append({'object_type': 'header',
                           'text': '%s Workflows for %s'
                           % (configuration.site_vgrid_label, vgrid_name)})

    logger.info('vgridworkflows %s' % vgrid_name)

    # Display active trigger jobs for this vgrid

    output_objects.append({'object_type': 'sectionheader',
                           'text': 'Active Trigger Jobs'})
    html = '<table><thead><tr>'
    html += '<th>Job ID</th>'
    html += '<th>Rule</th>'
    html += '<th>Path</th>'
    html += '<th>Change</th>'
    html += '<th>Time</th>'
    html += '<th>Status</th>'
    html += '</tr></thead>'
    html += '<tbody>'

    trigger_job_dir = os.path.join(configuration.vgrid_home,
                                   os.path.join(vgrid_name, '.%s.jobs'
                                                % configuration.vgrid_triggers))
    trigger_job_pending_dir = os.path.join(trigger_job_dir,
                                           'pending_states')
    trigger_job_final_dir = os.path.join(trigger_job_dir, 'final_states')
    if makedirs_rec(trigger_job_pending_dir, logger) \
            and makedirs_rec(trigger_job_final_dir, logger):
        abs_vgrid_dir = '%s/' \
            % os.path.abspath(os.path.join(configuration.vgrid_files_home,
                                           vgrid_name))
        for filename in os.listdir(trigger_job_pending_dir):
            trigger_job_filepath = os.path.join(trigger_job_pending_dir,
                                                filename)
            trigger_job = unpickle(trigger_job_filepath, logger)
            # BUGFIX: unpickle returns False on broken/corrupt files and the
            # key lookups below would then raise and break the whole page
            if not trigger_job:
                logger.error('could not open and unpickle: %s'
                             % trigger_job_filepath)
                continue
            serverjob_filepath = \
                os.path.join(configuration.mrsl_files_dir,
                             os.path.join(client_id_dir(trigger_job['owner']),
                                          '%s.mRSL'
                                          % trigger_job['jobid']))
            serverjob = unpickle(serverjob_filepath, logger)
            if serverjob:
                if serverjob['STATUS'] in pending_states:
                    trigger_event = trigger_job['event']
                    trigger_rule = trigger_job['rule']
                    trigger_action = trigger_event['event_type']
                    trigger_time = time.ctime(trigger_event['time_stamp'])
                    # show trigger src/dest paths relative to the vgrid dir
                    trigger_path = '%s %s' \
                        % (trigger_event['src_path'].replace(abs_vgrid_dir,
                                                             ''),
                           trigger_event['dest_path'].replace(abs_vgrid_dir,
                                                              ''))
                    # BUGFIX: original row template had a stray '</td>' and
                    # never closed the '<tr>' - emit well-formed row markup
                    html += \
                        '<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' \
                        % (trigger_job['jobid'], trigger_rule['rule_id'],
                           trigger_path, trigger_action, trigger_time,
                           serverjob['STATUS'])
                elif serverjob['STATUS'] in final_states:
                    # job finished one way or another - archive trigger file
                    src_path = os.path.join(trigger_job_pending_dir,
                                            filename)
                    dest_path = os.path.join(trigger_job_final_dir,
                                             filename)
                    move_file(src_path, dest_path, configuration)
                else:
                    logger.error('Trigger job: %s, unknown state: %s'
                                 % (trigger_job['jobid'],
                                    serverjob['STATUS']))
    html += '</tbody>'
    html += '</table>'
    output_objects.append({'object_type': 'html_form', 'text': html})

    # Display the log of recent trigger runs for this vgrid

    output_objects.append({'object_type': 'sectionheader',
                           'text': 'Trigger Log'})
    log_content = read_trigger_log(configuration, vgrid_name)
    output_objects.append({'object_type': 'html_form', 'text': '''
 <div class="form_container">
 <textarea id="logarea" rows=10 readonly="readonly">%s</textarea>
 </div>
 ''' % log_content})

    output_objects.append({'object_type': 'sectionheader',
                           'text': 'Manage Triggers'})

    # Always run as rule creator to avoid users being able to act on behalf
    # of ANY other user using triggers (=exploit)

    extra_fields = [
        ('path', None),
        ('match_dirs', ['False', 'True']),
        ('match_recursive', ['False', 'True']),
        ('changes', [keyword_all] + valid_trigger_changes),
        ('action', [keyword_auto] + valid_trigger_actions),
        ('arguments', None),
        ('run_as', client_id),
    ]

    # NOTE: we do NOT show saved template contents - see addvgridtriggers

    optional_fields = [('rate_limit', None), ('settle_time', None)]
    (status, oobjs) = vgrid_add_remove_table(
        client_id,
        vgrid_name,
        'trigger',
        'vgridtrigger',
        configuration,
        extra_fields,
        optional_fields,
    )
    output_objects.extend(oobjs)
    if not status:
        return (output_objects, returnvalues.SYSTEM_ERROR)

    return (output_objects, returnvalues.OK)
def create_monitor(vgrid_name): """Write monitor HTML file for vgrid_name""" html_file = os.path.join(configuration.vgrid_home, vgrid_name, '%s.html' % configuration.vgrid_monitor) print 'collecting statistics for VGrid %s' % vgrid_name sleep_secs = configuration.sleep_secs slackperiod = configuration.slackperiod now = time.asctime(time.localtime()) html_vars = { 'sleep_secs': sleep_secs, 'vgrid_name': vgrid_name, 'logo_url': '/images/logo.jpg', 'now': now, 'short_title': configuration.short_title, } monitor_meta = '''<meta http-equiv="refresh" content="%(sleep_secs)s" /> ''' % html_vars add_import = ''' <script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script> ''' add_init = '' add_ready = ''' // table initially sorted by col. 1 (name) var sortOrder = [[1,0]]; // use image path for sorting if there is any inside var imgTitle = function(contents) { var key = $(contents).find("a").attr("class"); if (key == null) { key = $(contents).html(); } return key; } $("table.monitor").tablesorter({widgets: ["zebra"], textExtraction: imgTitle, }); $("table.monitor").each(function () { try { $(this).trigger("sorton", [sortOrder]); } catch(err) { /* tablesorter chokes on empty tables - just continue */ } }); ''' monitor_js = ''' %s <script type="text/javascript" > %s $(document).ready(function() { %s } ); </script> ''' % (add_import, add_init, add_ready) # User default site style style_helpers = themed_styles(configuration) script_helpers = themed_scripts(configuration) script_helpers['advanced'] += add_import script_helpers['init'] += add_init script_helpers['ready'] += add_ready html = get_xgi_html_header( configuration, '%(short_title)s Monitor, VGrid %(vgrid_name)s' % html_vars, '', html=True, meta=monitor_meta, style_map=style_helpers, script_map=script_helpers, frame=False, menu=False, widgets=False, userstyle=False, ) html += \ ''' <!-- end of raw header: this line is used by showvgridmonitor --> <h1>Statistics/monitor for the %(vgrid_name)s 
VGrid</h1> <div class="generatornote smallcontent"> This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs). </div> '''\ % html_vars # loop and get totals parse_count = 0 queued_count = 0 frozen_count = 0 executing_count = 0 finished_count = 0 failed_count = 0 retry_count = 0 canceled_count = 0 cpucount_requested = 0 cpucount_done = 0 nodecount_requested = 0 nodecount_done = 0 cputime_requested = 0 cputime_done = 0 used_walltime = 0 disk_requested = 0 disk_done = 0 memory_requested = 0 memory_done = 0 runtimeenv_dict = {'': 0} runtimeenv_requested = 0 runtimeenv_done = 0 number_of_jobs = 0 up_count = 0 down_count = 0 slack_count = 0 job_assigned = 0 job_assigned_cpus = 0 gstat = GridStat(configuration, logger) runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RUNTIMEENVIRONMENT', {}) parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'PARSE') queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'QUEUED') frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FROZEN') executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'EXECUTING') failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FAILED') retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RETRY') canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CANCELED') expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'EXPIRED') finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'FINISHED') nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'NODECOUNT_REQ') nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'NODECOUNT_DONE') cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUTIME_REQ') cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUTIME_DONE') used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'USED_WALLTIME') if (used_walltime == 0): used_walltime = datetime.timedelta(0) 
used_walltime = format_timedelta(used_walltime) disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'DISK_REQ') disk_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'DISK_DONE') memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'MEMORY_REQ') memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'MEMORY_DONE') cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUCOUNT_REQ') cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'CPUCOUNT_DONE') runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RUNTIMEENVIRONMENT_REQ') runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), 'RUNTIMEENVIRONMENT_DONE') number_of_jobs = parse_count number_of_jobs += queued_count number_of_jobs += frozen_count number_of_jobs += expired_count number_of_jobs += canceled_count number_of_jobs += failed_count number_of_jobs += executing_count number_of_jobs += finished_count number_of_jobs += retry_count html_vars = { 'parse_count': parse_count, 'queued_count': queued_count, 'frozen_count': frozen_count, 'executing_count': executing_count, 'failed_count': failed_count, 'retry_count': retry_count, 'canceled_count': canceled_count, 'expired_count': expired_count, 'finished_count': finished_count, 'number_of_jobs': number_of_jobs, 'cpucount_requested': cpucount_requested, 'cpucount_done': cpucount_done, 'nodecount_requested': nodecount_requested, 'nodecount_done': nodecount_done, 'cputime_requested': cputime_requested, 'cputime_done': cputime_done, 'used_walltime': used_walltime, 'disk_requested': disk_requested, 'disk_done': disk_done, 'memory_requested': memory_requested, 'memory_done': memory_done, 'runtimeenv_requested': runtimeenv_requested, 'runtimeenv_done': runtimeenv_done, } html += \ """<h2>Job Stats</h2><table class=monitorstats><tr><td> <table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr> <tr><td>Parse</td><td>%(parse_count)s</td></tr> 
<tr><td>Queued</td><td>%(queued_count)s</td></tr> <tr><td>Frozen</td><td>%(frozen_count)s</td></tr> <tr><td>Executing</td><td>%(executing_count)s</td></tr> <tr><td>Failed</td><td>%(failed_count)s</td></tr> <tr><td>Retry</td><td>%(retry_count)s</td></tr> <tr><td>Canceled</td><td>%(canceled_count)s</td></tr> <tr><td>Expired</td><td>%(expired_count)s</td></tr> <tr><td>Finished</td><td>%(finished_count)s</td></tr> <tr><td>Total</td><td>%(number_of_jobs)s</td></tr> </table> </td><td> <table class=monitorresreq> <tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr> <tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr> <tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr> <tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr> <tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr> <tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr> <tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr> <tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr> </table><br /> </td><td> <div class=monitorruntimeenvdetails> <table class=monitorruntimeenvdone> <tr class=title><td>Runtime Envs Done</td><td></td></tr> """\ % html_vars if len(runtimeenv_dict.keys()) < 1: # No runtimeenv requests html += '<tr><td></td><td>-</td></tr>\n' else: for entry in runtimeenv_dict.keys(): if not entry == '': html += '<tr><td>' + entry + '</td><td>'\ + str(runtimeenv_dict[entry]) + '</td></tr>\n' total_number_of_exe_resources, total_number_of_store_resources = 0, 0 total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0 vgrid_name_list = vgrid_name.split('/') current_dir = '' exes, stores = '', '' for vgrid_name_part in vgrid_name_list: current_dir = os.path.join(current_dir, vgrid_name_part) abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir) # print 'dir: %s' % abs_mon_dir # Potential 
race - just ignore if it disappeared try: sorted_names = os.listdir(abs_mon_dir) except OSError: continue sorted_names.sort() for filename in sorted_names: # print filename if filename.startswith('monitor_last_request_'): # read last request helper file mon_file_name = os.path.join(abs_mon_dir, filename) print 'found ' + mon_file_name last_request_dict = unpickle(mon_file_name, logger) if not last_request_dict: print 'could not open and unpickle: '\ + mon_file_name continue if not last_request_dict.has_key('CREATED_TIME'): print 'skip broken last request dict: '\ + mon_file_name continue difference = datetime.datetime.now()\ - last_request_dict['CREATED_TIME'] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) last_timetuple = last_request_dict['CREATED_TIME'].timetuple() if last_request_dict.has_key('CPUTIME'): cputime = last_request_dict['CPUTIME'] elif last_request_dict.has_key('cputime'): cputime = last_request_dict['cputime'] else: print 'ERROR: last request does not contain cputime field!: %s'\ % last_request_dict continue try: cpusec = int(cputime) except ValueError: try: cpusec = int(float(cputime)) except ValueError, verr: print 'ERROR: failed to parse cputime %s: %s'\ % (cputime, verr) # Include execution delay guesstimate for strict fill # LRMS resources try: delay = int(last_request_dict['EXECUTION_DELAY']) except KeyError: delay = 0 except ValueError: delay = 0 time_remaining = (last_request_dict['CREATED_TIME'] + datetime.timedelta(seconds=cpusec) + datetime.timedelta(seconds=delay))\ - datetime.datetime.now() days_rem = str(time_remaining.days) hours_rem = str(time_remaining.seconds / 3600) minutes_rem = str((time_remaining.seconds % 3600) / 60) seconds_rem = str((time_remaining.seconds % 60) % 60) if time_remaining.days < -7: try: print 'removing: %s as we havent seen him for %s days.'\ % (mon_file_name, abs(time_remaining).days) 
os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s"\ % (mon_file_name, str(err)) pass else: unique_res_name_and_exe_list = \ filename.split('monitor_last_request_', 1) if cpusec == 0: resource_status = 'unavailable' elif time_remaining.days < 0: # time_remaining.days < 0 means that we have passed the specified time time_rem_abs = abs(time_remaining) if time_rem_abs.days == 0\ and int(time_rem_abs.seconds)\ < int(slackperiod): resource_status = 'slack' slack_count = slack_count + 1 else: resource_status = 'offline' down_count = down_count + 1 else: resource_status = 'online' up_count = up_count + 1 exes += '<tr>' exes += \ '<td><img src=/images/status-icons/%s.png /></td>'\ % resource_status public_id = unique_res_name_and_exe_list[1] if last_request_dict['RESOURCE_CONFIG'].get( 'ANONYMOUS', True): public_id = anon_resource_id(public_id) public_name = last_request_dict['RESOURCE_CONFIG'].get( 'PUBLICNAME', '') resource_parts = public_id.split('_', 2) resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % \ (resource_parts[0], resource_parts[0]) if public_name: resource_name += "<br />(alias %s)" % public_name else: resource_name += "<br />(no alias)" resource_name += "<br />%s" % resource_parts[1] exes += '<td>%s</td>' % resource_name last_asctime = time.asctime(last_timetuple) last_epoch = time.mktime(last_timetuple) exes += '<td><div class="sortkey">%s</div>%s<br />' % \ (last_epoch, last_asctime) exes += '(%sd %sh %sm %ss ago)</td>' % (days, hours, minutes, seconds) exes += '<td>' + vgrid_name + '</td>' runtime_envs = last_request_dict['RESOURCE_CONFIG'][ 'RUNTIMEENVIRONMENT'] runtime_envs.sort() re_list_text = ', '.join([i[0] for i in runtime_envs]) exes += '<td title="%s">' % re_list_text \ + str(len(runtime_envs)) + '</td>' exes += '<td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['CPUTIME']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['NODECOUNT']) + '</td><td>'\ + 
str(last_request_dict['RESOURCE_CONFIG' ]['CPUCOUNT']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['DISK']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['MEMORY']) + '</td><td>'\ + str(last_request_dict['RESOURCE_CONFIG' ]['ARCHITECTURE']) + '</td>' exes += '<td>' + last_request_dict['STATUS']\ + '</td><td>' + str(last_request_dict['CPUTIME' ]) + '</td>' exes += '<td class=status_%s>' % resource_status if 'unavailable' == resource_status: exes += '-' elif 'slack' == resource_status: exes += 'Within slack period (%s < %s secs)'\ % (time_rem_abs.seconds, slackperiod) elif 'offline' == resource_status: exes += 'down?' else: exes += '%sd, %sh, %sm, %ss'\ % (days_rem, hours_rem, minutes_rem, seconds_rem) exes += '</td>' exes += '</tr>\n' if last_request_dict['STATUS'] == 'Job assigned': job_assigned = job_assigned + 1 job_assigned_cpus = job_assigned_cpus\ + int(last_request_dict['RESOURCE_CONFIG' ]['NODECOUNT'])\ * int(last_request_dict['RESOURCE_CONFIG' ]['CPUCOUNT']) total_number_of_exe_resources += 1 total_number_of_exe_cpus += int( last_request_dict['RESOURCE_CONFIG']['NODECOUNT']) \ * int(last_request_dict['RESOURCE_CONFIG']['CPUCOUNT']) elif filename.startswith('monitor_last_status_'): # store must be linked to this vgrid, not only parent vgrid: # inheritance only covers access, not automatic participation if current_dir != vgrid_name: continue # read last resource action status file mon_file_name = os.path.join(abs_mon_dir, filename) print 'found ' + mon_file_name last_status_dict = unpickle(mon_file_name, logger) if not last_status_dict: print 'could not open and unpickle: '\ + mon_file_name continue if not last_status_dict.has_key('CREATED_TIME'): print 'skip broken last request dict: '\ + mon_file_name continue difference = datetime.datetime.now()\ - last_status_dict['CREATED_TIME'] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 
60) % 60) if last_status_dict['STATUS'] == 'stopped': time_stopped = datetime.datetime.now() - \ last_status_dict['CREATED_TIME'] if time_stopped.days > 7: try: print 'removing: %s as we havent seen him for %s days.'\ % (mon_file_name, abs(time_stopped).days) os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s"\ % (mon_file_name, str(err)) continue
def check_mrsl_files(
    configuration,
    job_queue,
    executing_queue,
    only_new,
    logger,
):
    """Check job files on disk in order to initialize job queue after
    (re)start of grid_script.

    Walks all of configuration.mrsl_files_dir and re-inserts jobs into the
    in-memory queues based on the STATUS field of each pickled mRSL file:
    PARSE jobs are re-announced to grid_script via its stdin pipe, QUEUED
    and EXECUTING jobs are appended to job_queue / executing_queue unless
    already present. With only_new set, files and dirs unmodified since the
    last recorded run (grid_script_laststart mtime) are skipped entirely.
    Finally the laststart marker is touched with the time this check began.
    """

    # We only check files modified since last start if possible

    last_start = 0
    last_start_file = os.path.join(configuration.mig_system_files,
                                   'grid_script_laststart')
    if os.path.exists(last_start_file):
        last_start = os.path.getmtime(last_start_file)

    check_mrsl_files_start_time = time.time()

    # TODO: switch to listdir or glob? all files are in mrsl_files_dir/*/*.mRSL

    for (root, _, files) in os.walk(configuration.mrsl_files_dir):

        # skip all dot dirs - they are from repos etc and _not_ jobs

        if root.find(os.sep + '.') != -1:
            continue

        # skip all dirs without any recent changes
        # NOTE(review): relies on the dir mtime being bumped whenever a job
        # file inside is added/renamed - presumably true on the target
        # filesystems, verify if porting.

        if only_new and os.path.getmtime(root) < last_start:
            logger.info('check mRSL files: skipping unchanged dir: %s'
                        % root)
            continue
        logger.info('check mRSL files: inspecting %d files in %s' %
                    (len(files), root))
        file_count = 0
        for name in files:
            filename = os.path.join(root, name)
            file_count += 1
            # periodic progress logging for big spools
            if file_count % 1000 == 0:
                logger.info('check mRSL files: %d files in %s checked' %
                            (file_count, root))
            if os.path.getmtime(filename) < last_start:
                if only_new:
                    #logger.debug('skipping treated mrsl file: %s'
                    #             % filename)
                    continue
                logger.info('parsing possibly outdated mrsl file: %s'
                            % filename)

            job_dict = io.unpickle(filename, logger)
            if not job_dict:
                logger.error('could not open and unpickle: %s' % filename)
                continue

            if job_dict['STATUS'] == 'PARSE':

                # parse is ok, since mRSL file exists
                # tell 'grid_script' and let grid_script put it into the queue

                logger.info('Found a file with PARSE status: %s'
                            % job_dict['JOB_ID'])
                job_id = job_dict['JOB_ID']
                client_id = job_dict['USER_CERT']
                client_dir = client_id_dir(client_id)
                message = 'USERJOBFILE %s/%s\n' % (client_dir, job_id)
                if not send_message_to_grid_script(message, logger,
                        configuration):
                    print 'Fatal error: Could not write to grid stdin'
            elif job_dict['STATUS'] == 'QUEUED'\
                 and not job_queue.get_job_by_id(job_dict['JOB_ID']):

                # put in job queue

                logger.info('USERJOBFILE: There were %s jobs in the job_queue'
                            % job_queue.queue_length())
                job_queue.enqueue_job(job_dict, job_queue.queue_length())
                logger.info("Now there's %s (QUEUED job %s added)"
                            % (job_queue.queue_length(),
                               job_dict['JOB_ID']))
            elif job_dict['STATUS'] == 'EXECUTING'\
                 and not executing_queue.get_job_by_id(job_dict['JOB_ID']):

                # put in executing queue

                logger.info('USERJOBFILE: There were %s jobs in the executing_queue'
                            % executing_queue.queue_length())
                executing_queue.enqueue_job(job_dict,
                                            executing_queue.queue_length())
                logger.info("Now there's %s (EXECUTING job %s added)"
                            % (executing_queue.queue_length(),
                               job_dict['JOB_ID']))
            else:
                # all other statuses (FINISHED, CANCELED, ...) need no action
                # logger.debug('Job in %s is already treated' % filename)
                continue

    # update last_start_file access times. Note the timestamp is not "now" but
    # when check_mrsl_files was called to avoid losing any jobs being parsed
    # at the same time as this function is running.

    logger.info('setting time of last_start_file %s to %s'
                % (last_start_file, check_mrsl_files_start_time))
    io.touch(last_start_file, check_mrsl_files_start_time)
    check_mrsl_files_end_time = time.time()
    logger.info('finished checking for mRSL files in %fs' %
                (check_mrsl_files_end_time-check_mrsl_files_start_time))
def vms_list(client_id, configuration):
    """Returns a list of dicts describing available user virtual machines
    described by the keys from default_vm_specs and the additional fields:

    'name', 'status', 'uuid', 'execution_time' and 'path'

    NOTE:

    The current state management does not fully exploit the powers of the
    grid, it only allows one 'instance' of the virtual machine to be
    running. But in practice a user could fairly use multiple instances
    of the same virtual machine. Using this basic model is feasible
    since the job submission is controlled via the web interface. It
    will however break if the user manually submits her own job.

    Currently two ways of deploying machines to resources exist:
    - By using VirtualBox frontend (work of Tomas)
    - By using webinterface (work of Simon)

    The storage of virtual machines are based on xml files (deployed by
    virtualbox) or ini files when deployed by MiG. This library supports
    both and the logic is separated into functions where appropriate.
    """

    # Grab the base directory of the user

    client_dir = client_id_dir(client_id)
    user_home = os.path.abspath(os.path.join(configuration.user_home,
                                             client_dir))
    mrsl_files_dir = os.path.abspath(
        os.path.join(configuration.mrsl_files_dir, client_dir))

    # Append the virtual machine directory

    vms_paths = glob(os.path.join(user_home, vm_base, '*', '*.cfg'))

    # List of virtual machines

    vms = []

    for vm_def_path in vms_paths:
        machine = {}
        machine_defaults = default_vm_specs(configuration)
        machine_state = {
            'name': 'UNKNOWN',
            'path': os.path.abspath(vm_def_path),
            'status': 'UNKNOWN',
            'execution_time': 'UNKNOWN',
            'job_id': 'UNKNOWN',
            'uuid': 'UNKNOWN',
        }
        machine.update(machine_defaults)
        machine.update(machine_state)

        # Grab the configuration file defining the machine

        vm_def_base = os.path.basename(os.path.dirname(vm_def_path))

        vm_config = ConfigParser.ConfigParser()
        vm_config.read([vm_def_path])

        machine['name'] = vm_def_base
        # override defaults with conf values
        for key in machine_defaults.keys():
            if vm_config.has_option('MiG', key):
                machine[key] = vm_config.get('MiG', key)
        # vgrid entry must be a list of strings
        if isinstance(machine['vgrid'], basestring):
            machine['vgrid'] = machine['vgrid'].split()

        # All job descriptions associated with this virtual machine

        jobs = []
        match_line = "$VBOXMANAGE -q createvm --name '" + vm_def_base \
            + "' --register"
        # we cannot inspect all mrsl files - filter by year is good guesstimate
        # TODO: mark vms jobs for easy finding without brute force search
        for mrsl_path in glob(os.path.join(mrsl_files_dir, '*_%d_*'
                                           % datetime.date.today().year)):
            # BUGFIX: use 'with' so every scanned mRSL file handle is closed
            # again instead of leaking one descriptor per candidate file
            with open(os.path.abspath(mrsl_path), 'r', 1) as mrsl_fd:
                for line in mrsl_fd:
                    if match_line in line:
                        job_dict = unpickle(mrsl_path, configuration.logger)
                        # BUGFIX: unpickle returns False on broken files -
                        # appending that would crash the itemgetter sort below
                        if job_dict:
                            jobs.append(job_dict)
                        # one match identifies the job file - stop scanning to
                        # avoid duplicate appends of the same job
                        break

        # Base the state on the latest job.
        #
        # Now determine the state of the jobs.
        # Job status can be one of EXECUTING, CANCELED, FAILED, QUEUED,
        # FINISHED, the machine state mapping is:
        # EXECUTING -> Powered On
        # CANCELED/FAILED/FINISHED -> Powered Off
        # QUEUED -> Booting
        #
        # TODO: 3

        if len(jobs) > 0:
            sorted_jobs = sorted(jobs, key=operator.itemgetter('JOB_ID'))
            last = sorted_jobs[-1]
            machine['status'] = last['STATUS']
            if machine['status'] == 'EXECUTING':
                machine['execution_time'] = last['EXECUTING_TIMESTAMP']
            machine['job_id'] = last['JOB_ID']

        vms.append(machine)

    return vms
return (False, '''Fatal error: Could not get exclusive access or write to %s''' % configuration.grid_stdin) if forceddestination and forceddestination.has_key('RE_NAME'): # add job_id to runtime environment verification history unique_resource_name = forceddestination['UNIQUE_RESOURCE_NAME'] re_name = forceddestination['RE_NAME'] resource_config_filename = configuration.resource_home\ + unique_resource_name + '/config' # open resource config resource_config = unpickle(resource_config_filename, logger) if not resource_config: logger.error('error unpickling resource config') return False dict_entry = (job_id, client_id) # add entry to runtime verification history if not resource_config.has_key('RUNTVERIFICATION'): resource_config['RUNTVERIFICATION'] = \ {re_name: [dict_entry]} else: before_runt_dict = resource_config['RUNTVERIFICATION'] if not before_runt_dict.has_key(re_name): before_runt_dict[re_name] = [].append(dict_entry)
def main(client_id, user_arguments_dict): """Main function used by front end""" (configuration, logger, output_objects, op_name) = \ initialize_main_variables(client_id, op_header=False) client_dir = client_id_dir(client_id) defaults = signature()[1] (validate_status, accepted) = validate_input_and_cert( user_arguments_dict, defaults, output_objects, client_id, configuration, allow_rejects=False, ) if not validate_status: return (accepted, returnvalues.CLIENT_ERROR) logger.debug("User: %s executing %s" % (client_id, op_name)) if not configuration.site_enable_jupyter: output_objects.append({ 'object_type': 'error_text', 'text': 'The Jupyter service is not enabled on the system' }) return (output_objects, returnvalues.SYSTEM_ERROR) if not configuration.site_enable_sftp_subsys and not \ configuration.site_enable_sftp: output_objects.append({ 'object_type': 'error_text', 'text': 'The required sftp service is not enabled on the system' }) return (output_objects, returnvalues.SYSTEM_ERROR) if configuration.site_enable_sftp: sftp_port = configuration.user_sftp_port if configuration.site_enable_sftp_subsys: sftp_port = configuration.user_sftp_subsys_port requested_service = accepted['service'][-1] service = { k: v for options in configuration.jupyter_services for k, v in options.items() if options['service_name'] == requested_service } if not service: valid_services = [ options['name'] for options in configuration.jupyter_services ] output_objects.append({ 'object_type': 'error_text', 'text': '%s is not a valid jupyter service, ' 'allowed include %s' % (requested_service, valid_services) }) return (output_objects, returnvalues.SYSTEM_ERROR) valid_service = valid_jupyter_service(configuration, service) if not valid_service: output_objects.append({ 'object_type': 'error_text', 'text': 'The service %s appears to be misconfigured, ' 'please contact a system administrator about this issue' % requested_service }) return (output_objects, returnvalues.SYSTEM_ERROR) host = 
get_host_from_service(configuration, service) # Get an active jupyterhost if host is None: logger.error("No active jupyterhub host could be found") output_objects.append({ 'object_type': 'error_text', 'text': 'Failed to establish connection to the %s Jupyter service' % service['service_name'] }) output_objects.append({ 'object_type': 'link', 'destination': 'jupyter.py', 'text': 'Back to Jupyter services overview' }) return (output_objects, returnvalues.SYSTEM_ERROR) remote_user = unescape(os.environ.get('REMOTE_USER', '')).strip() if not remote_user: logger.error("Can't connect to jupyter with an empty REMOTE_USER " "environment variable") output_objects.append({ 'object_type': 'error_text', 'text': 'Failed to establish connection to the Jupyter service' }) return (output_objects, returnvalues.CLIENT_ERROR) # Ensure the remote_user dict can be http posted remote_user = str(remote_user) # TODO, activate admin info # remote_user = {'USER': username, 'IS_ADMIN': is_admin(client_id, # configuration, # logger)} # Regular sftp path mnt_path = os.path.join(configuration.jupyter_mount_files_dir, client_dir) # Subsys sftp path subsys_path = os.path.join(configuration.mig_system_files, 'jupyter_mount') # sftp session path link_home = configuration.sessid_to_jupyter_mount_link_home user_home_dir = os.path.join(configuration.user_home, client_dir) # Preparing prerequisites if not os.path.exists(mnt_path): os.makedirs(mnt_path) if not os.path.exists(link_home): os.makedirs(link_home) if configuration.site_enable_sftp_subsys: if not os.path.exists(subsys_path): os.makedirs(subsys_path) # Make sure ssh daemon does not complain tighten_key_perms(configuration, client_id) url_base = '/' + service['service_name'] url_home = url_base + '/home' url_auth = host + url_base + '/hub/login' url_data = host + url_base + '/hub/user-data' # Does the client home dir contain an active mount key # If so just keep on using it. 
jupyter_mount_files = [ os.path.join(mnt_path, jfile) for jfile in os.listdir(mnt_path) if jfile.endswith('.jupyter_mount') ] logger.info("User: %s mount files: %s" % (client_id, "\n".join(jupyter_mount_files))) logger.debug("Remote-User %s" % remote_user) active_mounts = [] for jfile in jupyter_mount_files: jupyter_dict = unpickle(jfile, logger) if not jupyter_dict: # Remove failed unpickle logger.error("Failed to unpickle %s removing it" % jfile) remove_jupyter_mount(jfile, configuration) else: # Mount has been timed out if not is_active(jupyter_dict): remove_jupyter_mount(jfile, configuration) else: # Valid mount active_mounts.append({'path': jfile, 'state': jupyter_dict}) logger.debug( "User: %s active keys: %s" % (client_id, "\n".join([mount['path'] for mount in active_mounts]))) # If multiple are active, remove oldest active_mount, old_mounts = get_newest_mount(active_mounts) for mount in old_mounts: remove_jupyter_mount(mount['path'], configuration) # A valid active key is already present redirect straight to the jupyter # service, pass most recent mount information if active_mount is not None: mount_dict = mig_to_mount_adapt(active_mount['state']) user_dict = mig_to_user_adapt(active_mount['state']) logger.debug("Existing header values, Mount: %s User: %s" % (mount_dict, user_dict)) auth_header = {'Remote-User': remote_user} json_data = {'data': {'Mount': mount_dict, 'User': user_dict}} if configuration.site_enable_workflows: workflows_dict = mig_to_workflows_adapt(active_mount['state']) if not workflows_dict: # No cached workflows session could be found -> refresh with a # one workflow_session_id = get_workflow_session_id( configuration, client_id) if not workflow_session_id: workflow_session_id = create_workflow_session_id( configuration, client_id) # TODO get this dynamically url = configuration.migserver_https_sid_url + \ '/cgi-sid/workflowsjsoninterface.py?output_format=json' workflows_dict = { 'WORKFLOWS_URL': url, 'WORKFLOWS_SESSION_ID': 
workflow_session_id } logger.debug("Existing header values, Workflows: %s" % workflows_dict) json_data['workflows_data'] = {'Session': workflows_dict} with requests.session() as session: # Authenticate and submit data response = session.post(url_auth, headers=auth_header) if response.status_code == 200: response = session.post(url_data, json=json_data) if response.status_code != 200: logger.error( "Jupyter: User %s failed to submit data %s to %s" % (client_id, json_data, url_data)) else: logger.error( "Jupyter: User %s failed to authenticate against %s" % (client_id, url_auth)) # Redirect client to jupyterhub return jupyter_host(configuration, output_objects, remote_user, url_home) # Create a new keyset # Create login session id session_id = generate_random_ascii(2 * session_id_bytes, charset='0123456789abcdef') # Generate private/public keys (mount_private_key, mount_public_key) = generate_ssh_rsa_key_pair(encode_utf8=True) # Known hosts sftp_addresses = socket.gethostbyname_ex( configuration.user_sftp_show_address or socket.getfqdn()) # Subsys sftp support if configuration.site_enable_sftp_subsys: # Restrict possible mount agent auth_content = [] restrict_opts = 'no-agent-forwarding,no-port-forwarding,no-pty,' restrict_opts += 'no-user-rc,no-X11-forwarding' restrictions = '%s' % restrict_opts auth_content.append('%s %s\n' % (restrictions, mount_public_key)) # Write auth file write_file('\n'.join(auth_content), os.path.join(subsys_path, session_id + '.authorized_keys'), logger, umask=027) logger.debug("User: %s - Creating a new jupyter mount keyset - " "private_key: %s public_key: %s " % (client_id, mount_private_key, mount_public_key)) jupyter_dict = { 'MOUNT_HOST': configuration.short_title, 'SESSIONID': session_id, 'USER_CERT': client_id, # don't need fraction precision, also not all systems provide fraction # precision. 
'CREATED_TIMESTAMP': int(time.time()), 'MOUNTSSHPRIVATEKEY': mount_private_key, 'MOUNTSSHPUBLICKEY': mount_public_key, # Used by the jupyterhub to know which host to mount against 'TARGET_MOUNT_ADDR': "@" + sftp_addresses[0] + ":", 'PORT': sftp_port } client_email = extract_field(client_id, 'email') if client_email: jupyter_dict.update({'USER_EMAIL': client_email}) if configuration.site_enable_workflows: workflow_session_id = get_workflow_session_id(configuration, client_id) if not workflow_session_id: workflow_session_id = create_workflow_session_id( configuration, client_id) # TODO get this dynamically url = configuration.migserver_https_sid_url + \ '/cgi-sid/workflowsjsoninterface.py?output_format=json' jupyter_dict.update({ 'WORKFLOWS_URL': url, 'WORKFLOWS_SESSION_ID': workflow_session_id }) # Only post the required keys, adapt to API expectations mount_dict = mig_to_mount_adapt(jupyter_dict) user_dict = mig_to_user_adapt(jupyter_dict) workflows_dict = mig_to_workflows_adapt(jupyter_dict) logger.debug("User: %s Mount header: %s" % (client_id, mount_dict)) logger.debug("User: %s User header: %s" % (client_id, user_dict)) if workflows_dict: logger.debug("User: %s Workflows header: %s" % (client_id, workflows_dict)) # Auth and pass a new set of valid mount keys auth_header = {'Remote-User': remote_user} json_data = {'data': {'Mount': mount_dict, 'User': user_dict}} if workflows_dict: json_data['workflows_data'] = {'Session': workflows_dict} # First login with requests.session() as session: # Authenticate response = session.post(url_auth, headers=auth_header) if response.status_code == 200: response = session.post(url_data, json=json_data) if response.status_code != 200: logger.error( "Jupyter: User %s failed to submit data %s to %s" % (client_id, json_data, url_data)) else: logger.error("Jupyter: User %s failed to authenticate against %s" % (client_id, url_auth)) # Update pickle with the new valid key jupyter_mount_state_path = os.path.join(mnt_path, session_id + 
'.jupyter_mount') pickle(jupyter_dict, jupyter_mount_state_path, logger) # Link jupyter pickle state file linkdest_new_jupyter_mount = os.path.join(mnt_path, session_id + '.jupyter_mount') linkloc_new_jupyter_mount = os.path.join(link_home, session_id + '.jupyter_mount') make_symlink(linkdest_new_jupyter_mount, linkloc_new_jupyter_mount, logger) # Link userhome linkloc_user_home = os.path.join(link_home, session_id) make_symlink(user_home_dir, linkloc_user_home, logger) return jupyter_host(configuration, output_objects, remote_user, url_home)
def create_monitor(vgrid_name): """Write monitor HTML file for vgrid_name""" html_file = os.path.join(configuration.vgrid_home, vgrid_name, "%s.html" % configuration.vgrid_monitor) print "collecting statistics for VGrid %s" % vgrid_name sleep_secs = configuration.sleep_secs slackperiod = configuration.slackperiod now = time.asctime(time.localtime()) html_vars = { "sleep_secs": sleep_secs, "vgrid_name": vgrid_name, "logo_url": "/images/logo.jpg", "now": now, "short_title": configuration.short_title, } html = get_cgi_html_header( configuration, "%(short_title)s Monitor, VGrid %(vgrid_name)s" % html_vars, "", True, """<meta http-equiv="refresh" content="%(sleep_secs)s" /> """ % html_vars, themed_styles(configuration), """ <script type="text/javascript" src="/images/js/jquery.js"></script> <script type="text/javascript" src="/images/js/jquery.tablesorter.js"></script> <script type="text/javascript" > $(document).ready(function() { // table initially sorted by col. 1 (name) var sortOrder = [[1,0]]; // use image path for sorting if there is any inside var imgTitle = function(contents) { var key = $(contents).find("a").attr("class"); if (key == null) { key = $(contents).html(); } return key; } $("table.monitor").tablesorter({widgets: ["zebra"], textExtraction: imgTitle, }); $("table.monitor").each(function () { try { $(this).trigger("sorton", [sortOrder]); } catch(err) { /* tablesorter chokes on empty tables - just continue */ } }); } ); </script> """, "", False, ) html += ( """ <!-- end of raw header: this line is used by showvgridmonitor --> <h1>Statistics/monitor for the %(vgrid_name)s VGrid</h1> <div class="generatornote smallcontent"> This page was generated %(now)s (automatic refresh every %(sleep_secs)s secs). 
</div> """ % html_vars ) # loop and get totals parse_count = 0 queued_count = 0 frozen_count = 0 executing_count = 0 finished_count = 0 failed_count = 0 retry_count = 0 canceled_count = 0 cpucount_requested = 0 cpucount_done = 0 nodecount_requested = 0 nodecount_done = 0 cputime_requested = 0 cputime_done = 0 used_walltime = 0 disk_requested = 0 disk_done = 0 memory_requested = 0 memory_done = 0 runtimeenv_dict = {"": 0} runtimeenv_requested = 0 runtimeenv_done = 0 number_of_jobs = 0 up_count = 0 down_count = 0 slack_count = 0 job_assigned = 0 job_assigned_cpus = 0 gstat = GridStat(configuration, logger) runtimeenv_dict = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT", {}) parse_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "PARSE") queued_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "QUEUED") frozen_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FROZEN") executing_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXECUTING") failed_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FAILED") retry_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RETRY") canceled_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CANCELED") expired_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "EXPIRED") finished_count = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "FINISHED") nodecount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_REQ") nodecount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "NODECOUNT_DONE") cputime_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_REQ") cputime_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUTIME_DONE") used_walltime = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "USED_WALLTIME") if used_walltime == 0: used_walltime = datetime.timedelta(0) used_walltime = format_timedelta(used_walltime) disk_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_REQ") disk_done = 
gstat.get_value(gstat.VGRID, vgrid_name.upper(), "DISK_DONE") memory_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_REQ") memory_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "MEMORY_DONE") cpucount_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_REQ") cpucount_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "CPUCOUNT_DONE") runtimeenv_requested = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_REQ") runtimeenv_done = gstat.get_value(gstat.VGRID, vgrid_name.upper(), "RUNTIMEENVIRONMENT_DONE") number_of_jobs = parse_count number_of_jobs += queued_count number_of_jobs += frozen_count number_of_jobs += expired_count number_of_jobs += canceled_count number_of_jobs += failed_count number_of_jobs += executing_count number_of_jobs += finished_count number_of_jobs += retry_count html_vars = { "parse_count": parse_count, "queued_count": queued_count, "frozen_count": frozen_count, "executing_count": executing_count, "failed_count": failed_count, "retry_count": retry_count, "canceled_count": canceled_count, "expired_count": expired_count, "finished_count": finished_count, "number_of_jobs": number_of_jobs, "cpucount_requested": cpucount_requested, "cpucount_done": cpucount_done, "nodecount_requested": nodecount_requested, "nodecount_done": nodecount_done, "cputime_requested": cputime_requested, "cputime_done": cputime_done, "used_walltime": used_walltime, "disk_requested": disk_requested, "disk_done": disk_done, "memory_requested": memory_requested, "memory_done": memory_done, "runtimeenv_requested": runtimeenv_requested, "runtimeenv_done": runtimeenv_done, } html += ( """<h2>Job Stats</h2><table class=monitorstats><tr><td> <table class=monitorjobs><tr class=title><td>Job State</td><td>Number of jobs</td></tr> <tr><td>Parse</td><td>%(parse_count)s</td></tr> <tr><td>Queued</td><td>%(queued_count)s</td></tr> <tr><td>Frozen</td><td>%(frozen_count)s</td></tr> 
<tr><td>Executing</td><td>%(executing_count)s</td></tr> <tr><td>Failed</td><td>%(failed_count)s</td></tr> <tr><td>Retry</td><td>%(retry_count)s</td></tr> <tr><td>Canceled</td><td>%(canceled_count)s</td></tr> <tr><td>Expired</td><td>%(expired_count)s</td></tr> <tr><td>Finished</td><td>%(finished_count)s</td></tr> <tr><td>Total</td><td>%(number_of_jobs)s</td></tr> </table> </td><td> <table class=monitorresreq> <tr class=title><td>Requirement</td><td>Requested</td><td>Done</td></tr> <tr><td>Cpucount</td><td>%(cpucount_requested)s</td><td>%(cpucount_done)s</td></tr> <tr><td>Nodecount</td><td>%(nodecount_requested)s</td><td>%(nodecount_done)s</td></tr> <tr><td>Cputime</td><td>%(cputime_requested)s</td><td>%(cputime_done)s</td></tr> <tr><td>GB Disk</td><td>%(disk_requested)s</td><td>%(disk_done)s</td></tr> <tr><td>MB Memory</td><td>%(memory_requested)s</td><td>%(memory_done)s</td></tr> <tr><td>Runtime Envs</td><td>%(runtimeenv_requested)s</td><td>%(runtimeenv_done)s</td></tr> <tr><td>Used Walltime</td><td colspan='2'>%(used_walltime)s</td></tr> </table><br /> </td><td> <div class=monitorruntimeenvdetails> <table class=monitorruntimeenvdone> <tr class=title><td>Runtime Envs Done</td><td></td></tr> """ % html_vars ) if len(runtimeenv_dict.keys()) < 1: # No runtimeenv requests html += "<tr><td></td><td>-</td></tr>\n" else: for entry in runtimeenv_dict.keys(): if not entry == "": html += "<tr><td>" + entry + "</td><td>" + str(runtimeenv_dict[entry]) + "</td></tr>\n" total_number_of_exe_resources, total_number_of_store_resources = 0, 0 total_number_of_exe_cpus, total_number_of_store_gigs = 0, 0 vgrid_name_list = vgrid_name.split("/") current_dir = "" exes, stores = "", "" for vgrid_name_part in vgrid_name_list: current_dir = os.path.join(current_dir, vgrid_name_part) abs_mon_dir = os.path.join(configuration.vgrid_home, current_dir) # print 'dir: %s' % abs_mon_dir # Potential race - just ignore if it disappeared try: sorted_names = os.listdir(abs_mon_dir) except OSError: 
continue sorted_names.sort() for filename in sorted_names: # print filename if filename.startswith("monitor_last_request_"): # read last request helper file mon_file_name = os.path.join(abs_mon_dir, filename) print "found " + mon_file_name last_request_dict = unpickle(mon_file_name, logger) if not last_request_dict: print "could not open and unpickle: " + mon_file_name continue difference = datetime.datetime.now() - last_request_dict["CREATED_TIME"] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) if last_request_dict.has_key("CPUTIME"): cputime = last_request_dict["CPUTIME"] elif last_request_dict.has_key("cputime"): cputime = last_request_dict["cputime"] else: print "ERROR: last request does not contain cputime field!: %s" % last_request_dict continue try: cpusec = int(cputime) except ValueError: try: cpusec = int(float(cputime)) except ValueError, verr: print "ERROR: failed to parse cputime %s: %s" % (cputime, verr) # Include execution delay guesstimate for strict fill # LRMS resources try: delay = int(last_request_dict["EXECUTION_DELAY"]) except KeyError: delay = 0 except ValueError: delay = 0 time_remaining = ( last_request_dict["CREATED_TIME"] + datetime.timedelta(seconds=cpusec) + datetime.timedelta(seconds=delay) ) - datetime.datetime.now() days_rem = str(time_remaining.days) hours_rem = str(time_remaining.seconds / 3600) minutes_rem = str((time_remaining.seconds % 3600) / 60) seconds_rem = str((time_remaining.seconds % 60) % 60) if time_remaining.days < -7: try: print "removing: %s as we havent seen him for %s days." 
% ( mon_file_name, abs(time_remaining).days, ) os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s" % (mon_file_name, str(err)) pass else: unique_res_name_and_exe_list = filename.split("monitor_last_request_", 1) if cpusec == 0: resource_status = "unavailable" elif time_remaining.days < 0: # time_remaining.days < 0 means that we have passed the specified time time_rem_abs = abs(time_remaining) if time_rem_abs.days == 0 and int(time_rem_abs.seconds) < int(slackperiod): resource_status = "slack" slack_count = slack_count + 1 else: resource_status = "offline" down_count = down_count + 1 else: resource_status = "online" up_count = up_count + 1 exes += "<tr>" exes += "<td><img src=/images/status-icons/%s.png /></td>" % resource_status public_id = unique_res_name_and_exe_list[1] if last_request_dict["RESOURCE_CONFIG"].get("ANONYMOUS", True): public_id = anon_resource_id(public_id) public_name = last_request_dict["RESOURCE_CONFIG"].get("PUBLICNAME", "") resource_parts = public_id.split("_", 2) resource_name = "<a href='viewres.py?unique_resource_name=%s'>%s</a>" % ( resource_parts[0], resource_parts[0], ) if public_name: resource_name += "<br />(alias %s)" % public_name else: resource_name += "<br />(no alias)" resource_name += "<br />%s" % resource_parts[1] exes += "<td>%s</td>" % resource_name exes += "<td>%s<br />(%sd %sh %sm %ss ago)</td>" % ( time.asctime(last_request_dict["CREATED_TIME"].timetuple()), days, hours, minutes, seconds, ) exes += "<td>" + vgrid_name + "</td>" runtime_envs = last_request_dict["RESOURCE_CONFIG"]["RUNTIMEENVIRONMENT"] re_list_text = ", ".join([i[0] for i in runtime_envs]) exes += '<td title="%s">' % re_list_text + str(len(runtime_envs)) + "</td>" exes += ( "<td>" + str(last_request_dict["RESOURCE_CONFIG"]["CPUTIME"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"]) + "</td><td>" + 
str(last_request_dict["RESOURCE_CONFIG"]["DISK"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["MEMORY"]) + "</td><td>" + str(last_request_dict["RESOURCE_CONFIG"]["ARCHITECTURE"]) + "</td>" ) exes += ( "<td>" + last_request_dict["STATUS"] + "</td><td>" + str(last_request_dict["CPUTIME"]) + "</td>" ) exes += "<td class=status_%s>" % resource_status if "unavailable" == resource_status: exes += "-" elif "slack" == resource_status: exes += "Within slack period (%s < %s secs)" % (time_rem_abs.seconds, slackperiod) elif "offline" == resource_status: exes += "down?" else: exes += "%sd, %sh, %sm, %ss" % (days_rem, hours_rem, minutes_rem, seconds_rem) exes += "</td>" exes += "</tr>\n" if last_request_dict["STATUS"] == "Job assigned": job_assigned = job_assigned + 1 job_assigned_cpus = job_assigned_cpus + int( last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"] ) * int(last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"]) total_number_of_exe_resources += 1 total_number_of_exe_cpus += int(last_request_dict["RESOURCE_CONFIG"]["NODECOUNT"]) * int( last_request_dict["RESOURCE_CONFIG"]["CPUCOUNT"] ) elif filename.startswith("monitor_last_status_"): # store must be linked to this vgrid, not only parent vgrid: # inheritance only covers access, not automatic participation if current_dir != vgrid_name: continue # read last resource action status file mon_file_name = os.path.join(abs_mon_dir, filename) print "found " + mon_file_name last_status_dict = unpickle(mon_file_name, logger) if not last_status_dict: print "could not open and unpickle: " + mon_file_name continue difference = datetime.datetime.now() - last_status_dict["CREATED_TIME"] days = str(difference.days) hours = str(difference.seconds / 3600) minutes = str((difference.seconds % 3600) / 60) seconds = str((difference.seconds % 60) % 60) if last_status_dict["STATUS"] == "stopped": time_stopped = datetime.datetime.now() - last_status_dict["CREATED_TIME"] if time_stopped.days > 7: try: print "removing: %s as we havent 
seen him for %s days." % ( mon_file_name, abs(time_stopped).days, ) os.remove(mon_file_name) except Exception, err: print "could not remove: '%s' Error: %s" % (mon_file_name, str(err)) continue
def set_user_display_active(
    client_id,
    display_number,
    vnc_port,
    password,
    configuration,
    logger,
):
    """Register display_number as actively used by client_id.

    Records the display together with its vnc_port and password in the
    pickled display dictionary on disk. Refuses the registration if the
    display is occupied by another user or if client_id already holds a
    different display number. Returns a (status, message) tuple where
    status is True on success and False with an explanation otherwise.
    """

    (init_ret, filename) = \
        initialize_and_get_display_dict_filename(configuration, logger)
    if not init_ret:
        return (False, 'could not initialize')

    (dis_ret, dis_dict) = get_dict_from_display_number(display_number,
                                                       configuration, logger)
    if not dis_ret:
        return (False, 'dict error, %s' % dis_dict)
    # -1 means the display is unused; otherwise check ownership
    if dis_dict != -1:
        if dis_dict['client_id'] != client_id:
            # display occupied by another user!
            return (False, 'display %s already in use by another user!'
                    % display_number)

    # getting here means display is free or used by client_id
    # NOTE: renamed local from 'dict' to avoid shadowing the builtin
    display_dict = unpickle(filename, logger)
    if display_dict is False:
        return (False, 'could not unpickle %s' % filename)

    current_display = get_users_display_number(client_id, configuration,
                                               logger)

    # the registration payload is identical in both register paths below
    display_entry = {'client_id': client_id, 'vnc_port': vnc_port,
                     'password': password}

    if not current_display:
        # register display
        display_dict[display_number] = display_entry
        pickle_status = pickle(display_dict, filename, logger)
        if not pickle_status:
            return (False, 'could not pickle %s when adding %s'
                    % (filename, display_dict[display_number]))
        logger.info('successfuly registered that display %s is in use by %s in %s'
                    % (display_number, client_id, filename))
        return (True, '')

    if current_display != display_number and current_display != -1:
        # problems..
        return (False,
                'set_user_display_active met a conflict, can not set display %s when user already has %s registered'
                % (display_number, current_display))
    else:
        # add display to dict
        display_dict[display_number] = display_entry
        pickle_status = pickle(display_dict, filename, logger)
        if not pickle_status:
            return (False, 'could not pickle %s when adding %s'
                    % (filename, display_dict[display_number]))
        logger.info('successfuly registered that display %s is in use by %s in %s %s'
                    % (display_number, client_id, display_dict, filename))
        return (True, '')
def load_schedule_cache(path, logger):
    """Read back the pickled schedule cache stored at path.

    Thin wrapper around io.unpickle; the result (or its failure value) is
    passed through unchanged.
    """
    cached_schedule = io.unpickle(path, logger)
    return cached_schedule