Beispiel #1
0
def _collect_missing_input_paths(info_files):
    """Return (missing_paths, faulty_files) for the ntuple input paths
    listed under 'tauID_training' in each info.json file.

    Parameters:
    ----------
    info_files : iterable of str
        Paths to info.json files to check.

    Returns:
    -------
    (missing_paths, faulty_files) : (list of str, list of str)
        Input paths that do not exist on disk, and the info files that
        referenced them (parallel lists, one entry per missing path).
    """
    missing_paths = []
    faulty_files = []
    for info_file in info_files:
        info_dict = ut.read_json_cfg(info_file)
        for input_paths in info_dict['tauID_training'].values():
            for input_path in input_paths.values():
                if not os.path.exists(input_path):
                    missing_paths.append(input_path)
                    faulty_files.append(info_file)
    return missing_paths, faulty_files


def test_check_input_path_existance():
    """Every ntuple input path referenced by the HH res/nonRes info.json
    files must exist on disk.

    The original duplicated the res and nonRes loops verbatim; both now go
    through _collect_missing_input_paths.
    """
    package_path = ml.__path__[0].replace('python', 'src')
    res_wildcard = os.path.join(
        package_path, 'info', 'HH', '*', 'res', '*', 'info.json')
    nonRes_wildcard = os.path.join(
        package_path, 'info', 'HH', '*', 'nonRes', 'info.json')
    info_files = glob.glob(res_wildcard) + glob.glob(nonRes_wildcard)
    missing_inputPaths, faulty_info = _collect_missing_input_paths(info_files)
    assert len(missing_inputPaths) == 0, "Missing ntuple directories: \n" \
        + str(set(missing_inputPaths)) + " in the files: \n" + str(set(faulty_info))
def main(to_continue, opt_dir, bbww):
    """Run particle-swarm hyperparameter optimization for the HH/bbWW training.

    Parameters:
    ----------
    to_continue : bool-like
        If truthy, resume a previous optimization from opt_dir instead of
        starting a fresh run from the $CMSSW_BASE settings directory.
    opt_dir : str
        Output directory of the run to be continued (used when to_continue).
    bbww : bool-like
        If truthy, use the bbWW normalizer/loader instead of the HH ones.
    """
    if not to_continue:
        # Fresh run: read the settings shipped with the package.
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = os.path.expandvars(global_settings['output_dir'])
    else:
        # Continuation: reuse the settings snapshot saved with the earlier run.
        settings_dir = os.path.join(opt_dir, 'run_settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = opt_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not to_continue:
        # Snapshot settings and info files so the run can be reproduced/resumed.
        if not os.path.exists(os.path.join(output_dir, 'run_settings')):
            ut.save_run_settings(output_dir)
        if not os.path.exists(os.path.join(output_dir, 'run_info')):
            ut.save_info_dir(output_dir)
    # use_scratch_for_data(global_settings)
    print("::::::: Reading parameters :::::::")
    # The search space definition depends on the ML backend (xgboost vs NN).
    if global_settings['ml_method'] == 'xgb':
        param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    else:
        param_file = os.path.join(settings_dir, 'nn_parameters.json')
    hyperparameter_info = ut.read_json_cfg(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    pso_settings.update(global_settings)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)  # NOTE(review): unused below
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    normalizer = hht.HHDataNormalizer if not bbww else bbwwt.bbWWDataNormalizer
    if os.path.exists(preferences['data_csv']):
        # Reuse the cached dataset if it was already written out.
        print(':::::::: Loading data from .csv file ::::::::')
        data = pandas.read_csv(preferences['data_csv'])
    else:
        print('::::::: Loading data to be saved to pandas.DataFrame :::::::')
        if not bbww:
            loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        else:
            loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
        data = loader.data
        if global_settings['ml_method'] in ['lbn', 'nn']:
            # presumably W+jets is not a separate class for the bb2l channel
            # — TODO confirm against mt.multiclass_encoding
            use_Wjets = True
            if 'bb2l' in global_settings['channel']:
                use_Wjets = False
            data = mt.multiclass_encoding(data, use_Wjets)
        loader.save_to_csv()
    # NOTE(review): 'data' is not passed to the swarm below — presumably the
    # fitness function reloads the saved csv; confirm in st.get_fitness_score.
    print("\n============ Starting hyperparameter optimization ==========\n")
    swarm = pt.ParticleSwarm(pso_settings, st.get_fitness_score,
                             hyperparameter_info, to_continue, output_dir)
    optimal_hyperparameters = swarm.particleSwarmOptimization()[0]
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(output_dir,
                                        'best_hyperparameters.json')
    ut.save_dict_to_json(optimal_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))
Beispiel #3
0
def test_check_info_files():
    """Every HH info.json (both res and nonRes) must parse without error.

    Fixes two defects of the original: the res/nonRes loops were duplicated
    verbatim, and both used a bare 'except:' which would also swallow
    KeyboardInterrupt and SystemExit.
    """
    package_path = ml.__path__[0].replace('python', 'src')
    res_wildcard = os.path.join(
        package_path, 'info', 'HH', '*', 'res', '*', 'info.json')
    nonRes_wildcard = os.path.join(
        package_path, 'info', 'HH', '*', 'nonRes', 'info.json')
    faulty_files = []
    for info_file in glob.glob(res_wildcard) + glob.glob(nonRes_wildcard):
        try:
            ut.read_json_cfg(info_file)
        except Exception:  # any parse/IO failure marks the file as faulty
            faulty_files.append(info_file)
    assert len(faulty_files) == 0, "Faulty files: " + str(faulty_files)
Beispiel #4
0
def test_check_weights_dir_existance():
    """Every resonant info.json must point at an existing 'weight_dir'."""
    package_path = ml.__path__[0].replace('python', 'src')
    res_wildcard = os.path.join(
        package_path, 'info', 'HH', '*', 'res', '*', 'info.json')
    missing_directories = [
        info_path
        for info_path in glob.glob(res_wildcard)
        if not os.path.exists(ut.read_json_cfg(info_path)['weight_dir'])
    ]
    assert len(missing_directories) == 0, "Missing weight_dirs in: " + str(missing_directories)
def renew_data_paths(global_settings):
    """Rewrite the run's ntuple input paths to live under /scratch-persistent.

    Loads the run_info/<addition>/info.json of the current run, replaces the
    prefix before '/hhAnalysis' in every input path of the configured
    tauID_training working point with '/scratch-persistent', and writes the
    modified dict back to the same file.
    """
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.expandvars(
        os.path.join(global_settings['output_dir'], 'run_info'))
    info_file = os.path.join(channel_dir, addition, 'info.json')
    info_dict = ut.read_json_cfg(info_file)
    input_paths = info_dict['tauID_training'][global_settings['tauID_training']]
    for name, old_path in input_paths.items():
        prefix = old_path.split('/hhAnalysis')[0]
        input_paths[name] = old_path.replace(prefix, '/scratch-persistent')
    ut.save_dict_to_json(info_dict, info_file)
def get_original_input_paths(global_settings):
    """Return the package-default ntuple input paths, with '/.' inserted
    after the pre-'/hhAnalysis' prefix of each path.

    Reads info.json from the $CMSSW_BASE package info directory for the
    configured channel and tauID_training working point.
    # presumably the '/.' marks the relative-sync root (e.g. rsync -R)
    # — TODO confirm against the caller
    """
    info_dir = os.path.join(os.path.expandvars('$CMSSW_BASE'),
                            'src/machineLearning/machineLearning/info/HH')
    addition = ut.create_infoPath_addition(global_settings)
    info_file = os.path.join(info_dir, global_settings['channel'], addition,
                             'info.json')
    info_dict = ut.read_json_cfg(info_file)
    input_paths = info_dict['tauID_training'][global_settings['tauID_training']]
    for name, original in input_paths.items():
        prefix = original.split('/hhAnalysis')[0]
        input_paths[name] = original.replace(prefix, prefix + '/.')
    return input_paths
Beispiel #7
0
 def interpret_info_file(self):
     """Parse <info_dir>/info.json into the self.parameters dict.

     Resolves the default tauID application working point, merges input
     paths, era keys and training-variable info into self.parameters, and
     finally merges whatever remains of the raw info dict.
     """
     info_path = os.path.join(self.info_dir, 'info.json')
     info_dict = ut.read_json_cfg(info_path)
     self.parameters = {}
     # pop() (not plain indexing) so the raw sub-dicts are removed before the
     # final self.parameters.update(info_dict) below.
     tau_id_application = info_dict.pop('tauID_application')
     default_tauID = info_dict['default_tauID_application']
     tau_id_training = info_dict['tauID_training_key']
     self.parameters['tauID_application'] = tau_id_application[
         default_tauID]
     self.parameters.update(
         self.find_input_paths(info_dict, tau_id_training))
     self.parameters.update(self.load_era_keys(info_dict.pop('keys')))
     self.parameters['trainvars'], trainvars_info = self.load_trainvars()
     self.parameters['trainvar_info'] = trainvars_info
     self.parameters['all_trainvar_info'] = self.read_trainvar_info(
         self.all_trainvars_path)
     # Everything not popped above (e.g. default_tauID_application,
     # tauID_training_key) ends up in self.parameters as-is.
     self.parameters.update(info_dict)
Beispiel #8
0
 def set_background_sample_info(self, path):
     """Map an ntuple *path* to its background process name.

     Parameters:
     ----------
     path : str
         Ntuple path; matched against the sample names listed in
         background_categories.json.

     Returns:
     -------
     (process, target) : (str, int)
         The process name of the longest-matching sample and target 0
         (background label).

     Raises:
     ------
     ValueError
         If no known sample name occurs in *path* (the original raised an
         uninformative ValueError from max() on an empty sequence).
     """
     target = 0  # background samples are always labelled 0
     background_catfile = os.path.join(
         os.path.expandvars('$CMSSW_BASE'),
         'src/machineLearning/machineLearning/info',
         'HH',
         'background_categories.json'
     )
     background_categories = ut.read_json_cfg(background_catfile)
     possible_processes = []
     sample_dict = {}
     for category in background_categories:
         possible_samples = background_categories[category]
         sample_dict.update(possible_samples)
         for sample in possible_samples.keys():
             if sample in path:
                 possible_processes.append(sample)
     if not possible_processes:
         raise ValueError(
             'No known background sample name found in path: %s' % path)
     # Longest matching sample name wins, so a short name that is a
     # substring of a longer one cannot shadow it.
     process = sample_dict[max(possible_processes, key=len)]
     return process, target
Beispiel #9
0
 def data_cutting(self, data):
     """Filter *data* with the min/max cuts from the channel's cut file.

     The cut file location is derived from global_settings (process, channel,
     scenario); 'dataCuts' == 1 selects the default 'cuts.json', any other
     value is taken as the cut file name itself. Missing 'min'/'max' entries
     (or columns absent from *data*) are reported and skipped.
     """
     settings = self.global_settings
     package_dir = os.path.join(
         os.path.expandvars('$CMSSW_BASE'),
         'src/machineLearning/machineLearning/'
     )
     scenario = settings['scenario']
     addition = scenario if 'nonres' in scenario else 'res/%s' %(scenario)
     cut_file_name = 'cuts.json' if settings['dataCuts'] == 1 \
         else settings['dataCuts']
     cut_file = os.path.join(
         package_dir, 'info', settings['process'],
         settings['channel'], addition, cut_file_name
     )
     if not os.path.exists(cut_file):
         print('Cut file %s does not exist' % cut_file)
         return data
     cut_dict = ut.read_json_cfg(cut_file)
     if cut_dict == {}:
         print('No cuts given in the cut file %s' % cut_file)
         return data
     for key in list(cut_dict.keys()):
         # KeyError covers both a missing 'min'/'max' bound and a column
         # absent from the DataFrame.
         try:
             data = data.loc[(data[key] >= cut_dict[key]['min'])]
         except KeyError:
             print('Minimum condition for %s not implemented' % key)
         try:
             data = data.loc[(data[key] <= cut_dict[key]['max'])]
         except KeyError:
             print('Maximum condition for %s not implemented' % key)
     return data
Beispiel #10
0
def main(hyperparameter_file, output_dir):
    """Evaluate one hyperparameter set and write its score next to the file.

    Reads the run's global settings from output_dir/run_settings, loads the
    cached dataset (output_dir/data.csv), evaluates the hyperparameters
    (with k-fold CV when 'use_kfold' is set) and saves a score.json beside
    hyperparameter_file.

    Parameters:
    ----------
    hyperparameter_file : str
        Path to a JSON file with the hyperparameters to be evaluated.
    output_dir : str
        Directory of the optimization run (holds run_settings/, run_info/
        and data.csv).
    """
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    nthread = global_settings['nthread']  # NOTE(review): read but never used
    path = Path(hyperparameter_file)
    # score.json goes into the same directory as the hyperparameter file.
    save_dir = str(path.parent)
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)  # NOTE(review): unused here
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    global_settings['debug'] = False
    data_file = os.path.join(output_dir, 'data.csv')
    data = pandas.read_csv(data_file)
    if bool(global_settings['use_kfold']):
        score, train, test = et.kfold_cv(
            xt.model_evaluation_main,
            data,
            preferences['trainvars'],
            global_settings,
            hyperparameters
        )
    else:
        score, train, test = et.get_evaluation(
            xt.model_evaluation_main,
            data,
            preferences['trainvars'],
            global_settings,
            hyperparameters
        )
    score_path = os.path.join(save_dir, 'score.json')
    # Fitness value is stored under the configured fitness function's name,
    # alongside the train/test metrics.
    score_dict = {
        global_settings['fitness_fn']: score,
        'train': train,
        'test': test
    }
    with open(score_path, 'w') as score_file:
        json.dump(score_dict, score_file)
Beispiel #11
0
 def get_ntuple_paths(self, input_path, folder_name, file_type='hadd*Tight.root'):
     """Collect ntuple paths for a signal folder or a background category.

     If folder_name names a background category (and is not a signal
     folder), paths for every sample in that category are gathered;
     otherwise the folder name itself (wildcarded) is searched.
     """
     background_catfile = os.path.join(
         os.path.expandvars('$CMSSW_BASE'),
         'src/machineLearning/machineLearning/info',
         'HH',
         'background_categories.json'
     )
     background_categories = ut.read_json_cfg(background_catfile)
     is_background_category = (
         'signal' not in folder_name
         and folder_name in background_categories.keys()
     )
     if not is_background_category:
         return self.find_paths_both_conventions(
             input_path, folder_name + '*', file_type=file_type)
     collected = []
     for bkg_element in background_categories[folder_name]:
         element_paths = self.find_paths_both_conventions(
             input_path, bkg_element, file_type=file_type)
         print('--------------')
         print(bkg_element)
         print(element_paths)
         collected.extend(element_paths)
     return collected
Beispiel #12
0
def main(output_dir, settings_dir, hyperparameter_file, debug):
    """Load settings, preferences and hyperparameters, then run evaluation.

    The string 'None' for any path argument selects the corresponding
    default location.
    """
    if settings_dir == 'None':
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    if output_dir != 'None':
        global_settings['output_dir'] = output_dir
    else:
        output_dir = global_settings['output_dir']
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    reader = hpr.HHParameterReader(channel_dir, global_settings['scenario'])
    if hyperparameter_file == 'None':
        hyperparameter_file = os.path.join(info_dir, 'hyperparameters.json')
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    evaluation_main(global_settings, reader.parameters, hyperparameters, debug)
Beispiel #13
0
def main(output_dir, settings_dir, hyperparameter_file, debug):
    """Sync the channel-specific global settings file into place, then run
    the evaluation with optional BM-point and era restrictions.

    NOTE(review): 'channel', 'mode', 'res_nonres', 'era' and 'BM' are read
    from enclosing/module scope — confirm they are defined wherever this
    function is used.

    Parameters:
    ----------
    output_dir : str
        Output directory; 'None' means derive it from the settings.
    settings_dir : str
        Directory with the settings JSONs; 'None' means use $CMSSW_BASE.
    hyperparameter_file : str
        Hyperparameter JSON path; 'None' means <info_dir>/hyperparameters.json.
    debug : bool-like
        Forwarded to evaluation_main.
    """
    if settings_dir == 'None':
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
    global_settings = settings_dir + '/' + 'global_%s_%s_%s_settings.json' % (
        channel, mode, res_nonres)
    command = 'rsync %s ~/machineLearning/CMSSW_11_2_0_pre1/src/machineLearning/machineLearning/settings/global_settings.json' % global_settings
    p = subprocess.Popen(command,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    # BUG FIX: wait for rsync to finish. The original never waited on the
    # Popen handle, so read_settings below could read global_settings.json
    # before (or while) it was being copied — a race condition.
    p.communicate()
    global_settings = ut.read_settings(settings_dir, 'global')
    if output_dir == 'None':
        output_dir = global_settings['channel']+'/'+global_settings['ml_method']+'/'+\
                     res_nonres + '/' + mode +'/' + era
    # Both branches of the original ended with this same assignment; merged.
    global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if not BM == 'None':
        # Restrict the non-resonant scenarios to the requested benchmark point.
        preferences["nonResScenarios"] = [BM]
    print('BM point to be considered: ' + str(preferences["nonResScenarios"]))
    if not era == '0':
        preferences['included_eras'] = [era.replace('20', '')]
    print('era: ' + str(preferences['included_eras']))
    preferences = define_trainvars(global_settings, preferences, info_dir)
    if hyperparameter_file == 'None':
        hyperparameter_file = os.path.join(info_dir, 'hyperparameters.json')
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    print('hyperparametrs ' + str(hyperparameters))
    evaluation_main(global_settings, preferences, hyperparameters, debug)
Beispiel #14
0
def read_fitness(output_dir, fitness_key='d_roc'):
    """Creates the list of fitness scores of each sample. List is ordered
    according to the number of the sample

    Parameters:
    ----------
    output_dir : str
        Path to the directory of output
    fitness_key : str
        Key under which the fitness value is stored in each score.json
        (default 'd_roc')

    Returns:
    -------
    scores : list of floats
        List of fitnesses
    """
    samples = os.path.join(output_dir, 'samples')
    wild_card_path = os.path.join(samples, '*', 'score.json')
    number_samples = len(glob.glob(wild_card_path))
    # Read samples/<i>/score.json in numeric order (glob order is not sorted).
    score_dicts = [
        ut.read_json_cfg(os.path.join(samples, str(number), 'score.json'))
        for number in range(number_samples)
    ]
    return [score_dict[fitness_key] for score_dict in score_dicts]
def test_read_parameters():
    """best_parameters.json in the resources dir must parse to the known dict."""
    expected = {'a': 1, 'b': 2, 'c': 3}
    result = ut.read_json_cfg(
        os.path.join(resources_dir, 'best_parameters.json'))
    assert result == expected