def __init__(self, config):
    """Load the study's status from its output XML, (re)creating that file when needed.

    Reads the status file located via ``config.get_files().get_status_path()``,
    builds an ExperimentData entry per <experiment> element, and records the
    kcv fold count and the study's completion timestamp.
    """
    self._config = config
    # Maps experiment name -> ExperimentData, filled from the status XML below.
    self._experiments = {}
    status_path = config.get_files().get_status_path()

    # Load the status XML if it already exists on disk.
    study_xml = None
    if status_path.exists():
        study_xml = xml_load_from_path(status_path)
    # Missing or unparsable: generate the output file, then reload it.
    if study_xml is None:
        create_study_output(config)
        study_xml = xml_load_from_path(status_path)
    # if the output xml has one element it's check, 0 nothing has ran
    # if there's more than that then the experiment ran and user is evaluating
    # NOTE(review): regenerates (again) when the file holds only the check
    # element, so evaluation-time data is present before parsing below.
    if study_xml is None or len(study_xml.getchildren()) <= 1:
        create_study_output(config)
        study_xml = xml_load_from_path(status_path)

    for exp in study_xml.xpath('//experiment'):
        # keep track of experiment's name (derived from its count attribute)
        exp_name = Files.get_exp_name(exp.attrib['count'])
        self._experiments[exp_name] = ExperimentData(exp)

    # Fold count: only k-fold cross-validation carries a count attribute.
    model = single_xpath(study_xml, '/study/config/splitter/model')
    if model.text == 'kcv':
        self._kcv_count = int(model.attrib['count'])
    else:
        self._kcv_count = 0

    # Re-format the completion time into a compact, filename-safe timestamp.
    time = single_xpath(study_xml, 'completed_at').text
    time_obj = datetime.strptime(time, '%Y-%m-%d %H:%M:%S.%f')
    self._timestamp = time_obj.strftime("%Y%m%d_%H%M%S")
def process_params(self, param_elements):
    """Extract the name/value text of each <param> element into parallel lists.

    Stores names on ``self._params`` and values on ``self._vals``.
    """
    self._params = [single_xpath(elem, 'name').text for elem in param_elements]
    self._vals = [single_xpath(elem, 'value').text for elem in param_elements]
def fix_list_length(self):
    """Propagate the reranker's ``max_len`` parameter into the metric list-size.

    No-op when the configuration declares no ``max_len`` rerank parameter;
    otherwise rewrites the experiment configurations.
    """
    cfg = self._config
    max_len_elem = single_xpath(
        cfg._xml_input, '/librec-auto/rerank/script/param[@name="max_len"]')
    if max_len_elem is None:
        return
    list_size_elem = single_xpath(cfg._xml_input,
                                  "/librec-auto/metric/list-size")
    list_size_elem.text = max_len_elem.text
    cfg.write_exp_configs()
def dry_run(self, config):
    """Print, without executing, the full re-rank command line for each experiment."""
    self._files = config.get_files()
    self._config = config
    files = config.get_files()
    exp_count = files.get_exp_count()
    if exp_count > 0:
        for idx in range(exp_count):
            exp_paths = files.get_exp_paths(idx)
            script_elem = single_xpath(exp_paths.get_study_conf(),
                                       '/librec-auto/rerank/script')
            param_spec = create_param_spec(script_elem)
            script_path = get_script_path(script_elem, 'rerank')
            # Resolved for parity with a real run even though only printed paths use it.
            ref_path = exp_paths.get_ref_exp_name()
            result_path = exp_paths.get_path('result')
            original_path = self.find_original_results(result_path, script_path,
                                                       exp_paths)
            print(
                f'librec-auto (DR): Running re-ranking command {self} for {exp_paths.exp_name}'
            )
            study_files = self._config.get_files()
            proc_spec = [
                sys.executable,
                script_path.as_posix(),
                study_files.get_config_file_path().name,
                original_path.absolute().as_posix(),
                exp_paths.get_path('result').absolute().as_posix()
            ] + param_spec
            print_process_cli(proc_spec,
                              str(study_files.get_study_path().absolute()))
def cross_validation(self):
    """Return the configured k-fold count, or 1 when kcv splitting is not used."""
    model_elem = single_xpath(self._xml_input, '/librec-auto/splitter/model')
    if model_elem.text != 'kcv':
        return 1
    return int(model_elem.get('count'))
def set_data_path(self, config_xml):
    """Remember the configured data directory; warn when the element is absent."""
    data_dir_elem = single_xpath(config_xml, '/librec-auto/data/data-dir')
    if data_dir_elem is not None:
        self._data_dir_path = data_dir_elem.text
    else:
        # No data-dir element: leave the current path untouched and warn.
        logging.warning(
            "Configuration file missing data-dir element. Assuming 'data'."
        )
def load_item_features(config, data_path):
    """Load the item-feature CSV named in the configuration.

    Args:
        config: configuration object exposing ``get_xml()``.
        data_path: Path of the data directory containing the feature file.

    Returns:
        A pandas DataFrame indexed by ``itemid`` with ``feature`` and
        ``value`` columns, or None when the file does not exist.
    """
    item_feature_file = single_xpath(
        config.get_xml(), '/librec-auto/features/item-feature-file').text
    item_feature_path = data_path / item_feature_file
    if not item_feature_path.exists():
        # Bug fix: item_feature_path is a Path, so the original
        # str + Path concatenation raised TypeError instead of printing.
        print(f"Cannot locate item features. Path: {item_feature_path}")
        return None

    item_feature_df = pd.read_csv(item_feature_path,
                                  names=['itemid', 'feature', 'value'])
    item_feature_df.set_index('itemid', inplace=True)
    return item_feature_df
def main():
    """Rerank every result file with the fair* helper and write the output."""
    cli_args = read_args()
    config = read_config_file(cli_args['conf'], '.')

    source_path = Path(cli_args['original'])
    result_files = enumerate_results(source_path)
    target_path = Path(cli_args['result'])

    data_dir = single_xpath(config.get_xml(),
                            '/librec-auto/data/data-dir').text
    data_path = Path(data_dir).resolve()

    item_feature_df = load_item_features(config, data_path)
    protected = single_xpath(config.get_xml(),
                             '/librec-auto/metric/protected-feature').text
    if item_feature_df is None:
        exit(-1)

    # Build the shared rerank helper from CLI arguments and item features.
    helper = set_helper(float(cli_args['alpha']),
                        int(cli_args['max_len']),
                        cli_args['binary'] == 'True',
                        protected,
                        item_feature_df)

    for file_path in result_files:
        results_df = pd.read_csv(file_path,
                                 names=['userid', 'itemid', 'rating'])
        fair = generate_fairstar(helper)
        reranked_df = rerank(results_df, fair, helper)
        output_reranked(reranked_df, target_path, file_path)
def dry_run(self, config):
    """Show which post-processing scripts would run, masking any password parameter."""
    self._config = config
    print(f'librec-auto (DR): Running post command {self}')
    for post_elem in config.get_xml().xpath(self.POST_ELEM_XPATH):
        param_spec = utils.create_param_spec(post_elem)
        has_password = single_xpath(
            post_elem, "//param[@name='password']") is not None
        if has_password:
            # Never echo the real password in a dry run.
            param_spec = param_spec + ['--password=<password hidden>']
        script_path = utils.get_script_path(post_elem, 'post')
        print(f'\tPost script: {script_path}')
        print(f'\tParameters: {param_spec}')
def main():
    """Rerank all result files in parallel, one worker process per file."""
    cli_args = read_args()
    config = read_config_file(cli_args['conf'], '.')

    result_files = enumerate_results(Path(cli_args['original']))
    dest_path = Path(cli_args['result'])

    data_dir = single_xpath(config.get_xml(),
                            '/librec-auto/data/data-dir').text
    data_path = Path(data_dir).resolve()

    item_feature_df = load_item_features(config, data_path)
    if item_feature_df is None:
        exit(-1)

    rerank_helper = Rerank_Helper()
    rerank_helper.set_rerank_helper(cli_args, config, item_feature_df)

    split_path = data_path / 'split'
    pattern = re.compile(RESULT_FILE_PATTERN)
    method = cli_args['method']  # read for parity with the original flow; unused here

    workers = []
    for file_path in result_files:
        worker = multiprocessing.Process(target=execute,
                                         args=(rerank_helper, pattern,
                                               file_path, split_path,
                                               dest_path))
        workers.append(worker)
        worker.start()
    for worker in workers:
        worker.join()
def dry_run(self, config):
    """Report, without executing, the re-rank script each experiment would run."""
    self._files = config.get_files()
    self._config = config
    files = config.get_files()
    # range() over zero experiments simply does nothing, so no guard is needed.
    for idx in range(files.get_exp_count()):
        exp_paths = files.get_exp_paths(idx)
        script_elem = single_xpath(exp_paths.get_study_conf(),
                                   '/librec-auto/rerank/script')
        param_spec = create_param_spec(script_elem)
        script_path = get_script_path(script_elem, 'rerank')
        ref_path = exp_paths.get_ref_exp_name()
        print(
            f'librec-auto (DR): Running re-ranking command {self} for {exp_paths.exp_name}'
        )
        print(f'\tRe-rank script: {script_path}')
        print(f'\tParameters: {param_spec}')
        if ref_path:
            print(f'\tResults from: {ref_path}')
def __init__(self, sub_paths):
    """Load one experiment's status from its status XML file.

    When the status file does not exist, attributes other than
    ``self._subpaths`` are left unset.
    """
    self._subpaths = sub_paths
    status_path = self._subpaths.get_path('status')
    if status_path.exists():
        self._name = sub_paths.exp_name
        self._status_xml = xml_load_from_path(status_path)
        self._message = single_xpath(self._status_xml,
                                     '/librec-auto-status/message').text
        # Attach the experiment log if one was produced.
        if self._subpaths.get_path('log').exists():
            self._log = LogFile(self._subpaths)
        else:
            self._log = None
        # Bug fix: xpath() returns a (possibly empty) list and never None,
        # so the original `params != None` was always true. Test truthiness
        # so an empty result takes the explicit empty-state branch.
        params = self._status_xml.xpath('//param')
        if params:
            self.process_params(params)
        else:
            self._params = []
            # Bug fix: was `self.m_vals`; process_params() sets `_vals`,
            # so the empty case must use the same attribute name.
            self._vals = []
def dry_run(self, config):
    """Print the full command line each post-processing script would run with.

    Password parameters are masked so they are never echoed.
    """
    self._config = config
    print(f'librec-auto (DR): Running post command {self}')
    post_elems = config.get_xml().xpath(self.POST_ELEM_XPATH)
    for post_elem in post_elems:
        param_spec = utils.create_param_spec(post_elem)
        if single_xpath(post_elem, "//param[@name='password']") is not None:
            param_spec = param_spec + ['--password=<password hidden>']
        script_path = utils.get_script_path(post_elem, 'post')
        proc_spec = [
            sys.executable,
            script_path.absolute().as_posix(),
            self._config.get_files().get_config_file_path().name
        ] + param_spec
        # Bug fix: `.absolute` was passed as a bound method (missing call
        # parentheses). Call it and convert to str, matching the sibling
        # rerank dry_run's usage of print_process_cli.
        print_process_cli(
            proc_spec,
            str(self._config.get_files().get_study_path().absolute()))
def handle_password(self, post_elem, config, param_spec):
    """Append a ``--password`` argument when the script declares a password param.

    Returns the (possibly extended) param_spec list.
    """
    password_param = single_xpath(post_elem, "param[@name='password']")
    if password_param is not None:
        password = config.get_key_password()
        if password:
            param_spec.append(f'--password={password}')
    return param_spec
def setup_commands(args: dict, config: ConfigCmd):
    """Build the command object(s) to execute for the requested CLI action.

    Dispatches on ``args['action']`` ('purge', 'status', 'post', 'rerank',
    'split', 'bbo', 'run', 'show', 'eval', 'check') and returns either a
    single command, a bracketed SequenceCmd, or (for 'bbo') a flat list of
    commands. Raises InvalidCommand when an action's prerequisites (post or
    rerank scripts) are missing, UnsupportedFeatureException for
    optimization + reranking, and InvalidConfiguration for eval + optimize.
    """
    action = args['action']
    purge_no_ask = args['quiet']
    alg_lang = execution_platform(config, 'alg')
    met_lang = execution_platform(config, 'metric')
    # Create flags for optional steps
    rerank_flag = config.has_rerank()
    post_flag = config.has_post()
    # Flag to use/avoid check
    # if true, user specified don't run check, else, run check.
    no_check_flag = args['no_check']
    # Set the password in the configuration if we have it
    if args['key_password']:
        config.set_key_password(args['key_password'])
    # Purge files (possibly) from splits and subexperiments
    if action == 'purge':
        return PurgeCmd(purge_type(args), no_ask=purge_no_ask)
    # Shows the status of the experiment
    if action == 'status':
        return StatusCmd()
    # Perform (only) post-processing on results
    if action == 'post' and post_flag:
        return PostCmd()
    # No post scripts available
    if action == 'post' and not post_flag:
        raise InvalidCommand(
            action,
            "No post-processing scripts available for \"post\" command")
    # Perform re-ranking on results, followed by evaluation and post-processing
    if action == 'rerank' and rerank_flag:
        # Runs a reranking script on the python side
        cmd1 = RerankCmd()
        cmd2 = build_librec_commands('eval', args, config)
        cmd3 = EvalCmd(args, config)  # python-side eval
        cmd = SequenceCmd([cmd1, cmd2, cmd3])
        bracketed_cmd = bracket_sequence('rerank', args, config, cmd)
        return bracketed_cmd
    # No re-ranker available
    if action == 'rerank' and not rerank_flag:
        raise InvalidCommand(
            action,
            "No re-ranker scripts available for \"rerank\" command.")
    # LibRec actions
    # re-run splits only
    if action == 'split':
        cmd = SequenceCmd([build_librec_commands('split', args, config)])
        bracketed_cmd = bracket_sequence('split', args, config, cmd)
        return bracketed_cmd
    # re-run experiment with black-box optimization; returns a flat command list
    if action == 'bbo':
        cmd1 = PurgeCmd('results', no_ask=purge_no_ask)
        cmd2 = SetupCmd(False)
        # NOTE(review): cmd3, cmd_store and store_post are computed but never
        # used below; init_cmds/exec_cmds/final_cmds drive the returned list.
        cmd3 = [cmd1, cmd2]
        if config.has_alg_script():
            cmd_store = build_alg_commands(args, config, BBO=200)
        else:
            cmd_store = build_librec_commands('full', args, config, BBO=200)
        store_post = [PostCmd() for _ in range(len(cmd_store))]
        init_cmds = [cmd1, cmd2]
        check_cmds = []
        if not no_check_flag:
            # check_cmds = [build_librec_commands('check',args,config), CheckCmd()]
            librec_check = build_librec_commands('check', args, config, BBO=200)
            check_cmds = [librec_check[0], CheckCmd()]
        exec_cmds = build_librec_commands('full', args, config, BBO=200)
        exec_cmds = [
            SequenceCmd([exec_cmds[i]]) for i in range(len(exec_cmds))
        ]
        if rerank_flag:
            # cmd.append(RerankCmd())
            # cmd.append(build_exp_commands('eval', args, config))
            raise UnsupportedFeatureException(
                "Optimization",
                "Optimization is not currently supported with reranking")
        final_cmds = []
        if post_flag:
            final_cmds.append(PostCmd())
        else:
            final_cmds.append(CleanupCmd())
        # NOTE(review): check_cmds is intentionally excluded here (see the
        # commented-out alternative below).
        # cmd = init_cmds + check_cmds + exec_cmds + final_cmds
        cmd = init_cmds + exec_cmds + final_cmds
        return cmd
    # re-run experiment and continue (LibRec-side algorithm)
    if (action == 'run' or action == 'show') and not config.has_alg_script():
        cmd1 = build_librec_commands('full', args, config)
        add_eval = maybe_add_eval(config=config)
        if add_eval:
            # cmd2 = EvalCmd(args, config)  # python-side eval
            cmd2 = build_eval_commands(args, config, met_lang)
            cmd = SequenceCmd([cmd1, cmd2])
        else:
            cmd = SequenceCmd([cmd1])
        if rerank_flag:
            cmd.add_command(RerankCmd())
            cmd.add_command(build_librec_commands('eval', args, config))
        # bracketed_cmd = bracket_sequence('results', args, config, cmd)
        bracketed_cmd = bracket_sequence('all', args, config, cmd)
        return bracketed_cmd
    # re-run experiment and continue (script-side algorithm)
    if (action == 'run' or action == 'show') and config.has_alg_script():
        # if met_lang == 'system':
        cmd1 = build_alg_commands(args, config)
        add_eval = maybe_add_eval(config=config)
        if add_eval:
            cmd2 = EvalCmd(args, config)  # python-side eval
            cmd = SequenceCmd([cmd1, cmd2])
        else:
            cmd = SequenceCmd([cmd1])
        if rerank_flag:
            cmd.add_command(RerankCmd())
            cmd.add_command(build_librec_commands('eval', args, config))
        # bracketed_cmd = bracket_sequence('results', args, config, cmd)
        bracketed_cmd = bracket_sequence('all', args, config, cmd)
        return bracketed_cmd
    # eval-only
    if action == 'eval':
        if single_xpath(config.get_xml(),
                        '/librec-auto/optimize') is not None:
            raise InvalidConfiguration(
                "Eval-only not currently supported with Bayesian optimization."
            )
        # cmd1 = PurgeCmd('post', no_ask=purge_no_ask)
        # cmd2 = SetupCmd()
        cmd1 = build_librec_commands('eval', args, config)
        cmd2 = EvalCmd(args, config)  # python-side eval
        cmd = SequenceCmd([cmd1, cmd2])
        bracketed_cmd = bracket_sequence('post', args, config, cmd)
        return bracketed_cmd
    # check setup of experiment
    # We don't check on algorithm scripts
    if action == 'check':
        cmd1 = build_librec_commands('check', args, config)
        cmd2 = CheckCmd()
        cmd = SequenceCmd([cmd1, cmd2])
        bracketed_cmd = bracket_sequence('none', args, config, cmd)
        return bracketed_cmd
def setup_bbo(self):
    """Record the requested black-box-optimization iteration count (0 when absent)."""
    opt_elem = single_xpath(self._xml_input, '/librec-auto/optimize')
    if opt_elem is not None:
        self._bbo_steps = int(single_xpath(opt_elem, 'iterations').text)
    else:
        self._bbo_steps = 0
def has_alg_script(self):
    '''Determine if <alg> element in configuration file has script.'''
    return single_xpath(self._xml_input,
                        '/librec-auto/alg/script') is not None
def has_metric_script(self):
    '''Determine if <metric> element in configuration file has script.'''
    return single_xpath(self._xml_input,
                        '/librec-auto/metric/script') is not None
def execute(self, config: ConfigCmd):
    """Validate the study configuration and write the results to output.xml.

    Checks write access to study paths, required config sections, the
    library path, the data file, script locations, and (when present) the
    optimize section; then appends a summary plus any Java-log errors to
    the study's output.xml. Raises InvalidConfiguration on the first
    failed check.
    """
    self._status = Cmd.STATUS_INPROC
    files = config.get_files()
    pwd = files.get_study_path()
    config_xml = config._xml_input
    config_elements = config_xml.getchildren()

    output_path = config.get_files().get_study_path()
    output_xml_path = str(output_path / "output.xml")
    study_ran = Path(output_xml_path).exists()
    check_output_xml(output_xml_path)
    if study_ran:
        os.remove(output_xml_path)
        # check should be the first thing writing to an output.xml file
    output_tree = etree.Element("study")
    # clear the check elements from before, if present
    check_element = output_tree.find('check')
    if check_element is not None:
        output_tree.remove(check_element)

    # Check all paths have write access (status/post/split are exempt).
    for func in dir(files):
        if re.match(r'get_.*path$', func):
            getpath = getattr(files, func)
            if func == 'get_status_path' or func == 'get_post_path' or func == 'get_split_path':
                continue
            if not os.access(getpath(), os.W_OK):
                raise InvalidConfiguration(
                    getpath(), f"Write access not granted {func}")

    # Check all necessary elements are in the configuration.
    curr_elem = [e.tag for e in config_elements]
    necc_elem = {
        'data': 'Data section',
        'splitter': 'Splitter section',
        'alg': 'Algorithm section',
        'metric': 'Metric section'
    }
    for elem in necc_elem.keys():
        if elem not in curr_elem:
            raise InvalidConfiguration(
                necc_elem[elem],
                f"{necc_elem[elem]} missing in configuration file.")

    # Checking library: system libraries live under the lib path, others
    # are resolved relative to the study directory.
    library = single_xpath(config_xml, '/librec-auto/library')
    if library.attrib['src'] == "system":
        lib_path = files.get_lib_path() / library.text
    else:
        lib_path = pwd / library.attrib['src'] / library.text
    if not lib_path.exists():
        # Bug fix: corrected "give path" typo in the error message.
        raise InvalidConfiguration(lib_path, "Library not found at given path.")

    # Checking data.
    data_dir = single_xpath(config_xml, '/librec-auto/data/data-dir')
    # Test to see how many data directories were given.
    num_data_dir_test = config_xml.xpath('/librec-auto/data/data-dir')
    if len(num_data_dir_test) > 1:
        raise InvalidConfiguration("Data Directory",
                                   "More than one data file found.")
    # Checking path to data directory
    data_dir_path = Path(pwd / data_dir.text)
    data_file = single_xpath(config_xml, '/librec-auto/data/data-file')
    data_file_path = Path(data_dir_path / data_file.text)
    if not data_file_path.exists():
        raise InvalidConfiguration(str(data_file_path),
                                   "Data file not found at given path.")

    # Checking script paths/files exist and that scripts are in approved locations.
    for elem in config_elements:
        script_element = elem.findall('script')
        # findall returns list, check for items.
        if script_element:
            # Iterate over scripts.
            for se in script_element:
                if se.attrib['src'] == "system":
                    # System scripts live in the package's cmd subdirectory
                    # matching the section tag.
                    if elem.tag == 'metric':
                        script_path = files.get_global_path(
                        ) / 'librec_auto' / 'core' / 'cmd' / 'eval'
                    elif elem.tag == 'post':
                        script_path = files.get_global_path(
                        ) / 'librec_auto' / 'core' / 'cmd' / 'post'
                    elif elem.tag == 'rerank':
                        script_path = files.get_global_path(
                        ) / 'librec_auto' / 'core' / 'cmd' / 'rerank'
                    elif elem.tag == 'alg':
                        script_path = files.get_global_path(
                        ) / 'librec_auto' / 'core' / 'cmd' / 'alg'
                    else:
                        raise InvalidConfiguration(
                            elem.tag,
                            f"Scripts not allowed in {elem.tag} section.")
                else:
                    script_path = Path(se.attrib['src'])
                script_name = se.find('script-name')
                script_path = script_path / script_name.text
                if not script_path.exists():
                    raise InvalidConfiguration(
                        str(script_path),
                        f'{script_name.text} not found in given path.')
        # else: if there aren't script elements do nothing, for now

    if 'optimize' in curr_elem:
        alg = single_xpath(config_xml, '/librec-auto/alg')
        if alg is not None:
            for elem in alg:
                # parameters being optimized should have children, upper and lower
                if elem.getchildren():
                    children = [e.tag for e in elem.iterchildren()]
                    if 'value' in children:
                        # impossible case: librec-auto setup catches this first.
                        raise InvalidConfiguration(
                            'Optimization',
                            'Value tags not allowed in optimize element')
                    else:
                        # Bug fix: the original `if 'lower' and 'upper' not in
                        # children:` only tested 'upper' ('lower' is a truthy
                        # constant). Both tags must be present.
                        if 'lower' not in children or 'upper' not in children:
                            raise InvalidConfiguration(
                                'Optimization',
                                f'Lower and upper tags missing in {elem.tag}'
                            )
                        else:
                            # for now continue, should add check to make sure value
                            # from reference xml and config xml are same type.
                            pass

    # create filepath attribute for errors as src
    # if the compiler makes it to here without raising an error, then there are no errors
    if not study_ran:
        # if the output file doesn't exist
        check_tree = etree.SubElement(output_tree, "check")
        message_element = etree.SubElement(check_tree, "message")
        message_element.text = "No errors found in configuration file syntax."
    else:
        # if it does
        check_tree = etree.Element("check")
        message_element = etree.SubElement(check_tree, "message")
        message_element.text = "No errors found in configuration file syntax."
        output_tree.insert(2, check_tree)

    # reading the Java logs
    # check command shouldn't care about librec.properties file not found (unless run was ran)
    for i in range(0, config.get_sub_exp_count()):
        exp_path = config.get_files().get_exp_paths(i)
        log_object = LogFile(exp_path, study_ran)
        # src: filepath
        check_tree = output_tree.find('check')
        if check_tree is not None:
            if len(log_object._err_msgs.keys()) != 0:
                # dict-list comprehension to filter out ignorable errors
                temp_dict = {
                    k: [(line, m) for line, m in log_object._err_msgs[k]
                        if not self.is_ignorable_error(m)]
                    for k in log_object._err_msgs.keys()
                }
                # filter out empty lists
                temp_dict = {k: v for k, v in temp_dict.items() if v}
                # iterate over filtered dictionary
                if len(temp_dict.keys()) != 0:
                    for error in temp_dict.keys():
                        for line_number, message in temp_dict[error]:
                            message_element = etree.SubElement(
                                check_tree, "message", {
                                    'src': str(log_object.get_log_path()),
                                    'logline': str(line_number),
                                    'exp_num': str(i)
                                })
                            message_element.text = message.strip('\n')
                else:
                    message_element = etree.SubElement(
                        check_tree, "message",
                        {'src': str(log_object.get_log_path())})
                    message_element.text = f"No errors found in experiment {i} log."
            else:
                message_element = etree.SubElement(
                    check_tree, "message",
                    {'src': str(log_object.get_log_path())})
                message_element.text = f"No errors found in experiment {i} log."

    # Write the tree, then re-parse and re-write to normalize whitespace.
    output_tree.getroottree().write(output_xml_path, pretty_print=True)
    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(output_xml_path, parser)
    tree.write(output_xml_path, encoding='utf-8', pretty_print=True)

    self._status = Cmd.STATUS_COMPLETE
def enumerate_results(result_path):
    """Return the file names in result_path matching RESULT_FILE_PATTERN."""
    files = os.listdir(result_path)
    pat = re.compile(RESULT_FILE_PATTERN)
    return [file for file in files if pat.match(file)]


if __name__ == '__main__':
    args = read_args()
    config = read_config_file(args['conf'], ".")

    result_files = enumerate_results(args['original'])
    split_path = config.get_files().get_split_path()

    data_dir = single_xpath(config.get_xml(),
                            '/librec-auto/data/data-dir').text
    item_feature_file = single_xpath(
        config.get_xml(), '/librec-auto/features/item-feature-file').text
    protected = single_xpath(config.get_xml(),
                             '/librec-auto/metric/protected-feature').text

    item_feature_path = Path(data_dir) / item_feature_file
    item_feature_df = None
    if not item_feature_path.exists():
        # Bug fix: item_feature_path is a Path, so the original
        # str + Path concatenation raised TypeError instead of printing.
        print(f"Cannot locate item features. Path: {item_feature_path}")
        exit(-1)
    else:
        item_feature_df = pd.read_csv(item_feature_path,
                                      names=['itemid', 'feature', 'value'])