def main(to_continue, opt_dir, bbww):
    if not to_continue:
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = os.path.expandvars(global_settings['output_dir'])
    else:
        settings_dir = os.path.join(opt_dir, 'run_settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = opt_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not to_continue:
        if not os.path.exists(os.path.join(output_dir, 'run_settings')):
            ut.save_run_settings(output_dir)
        if not os.path.exists(os.path.join(output_dir, 'run_info')):
            ut.save_info_dir(output_dir)
    # use_scratch_for_data(global_settings)
    print("::::::: Reading parameters :::::::")
    if global_settings['ml_method'] == 'xgb':
        param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    else:
        param_file = os.path.join(settings_dir, 'nn_parameters.json')
    hyperparameter_info = ut.read_json_cfg(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    pso_settings.update(global_settings)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    normalizer = hht.HHDataNormalizer if not bbww else bbwwt.bbWWDataNormalizer
    if os.path.exists(preferences['data_csv']):
        print(':::::::: Loading data from .csv file ::::::::')
        data = pandas.read_csv(preferences['data_csv'])
    else:
        print('::::::: Loading data into a pandas.DataFrame :::::::')
        if not bbww:
            loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        else:
            loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
        data = loader.data
        if global_settings['ml_method'] in ['lbn', 'nn']:
            use_Wjets = True
            if 'bb2l' in global_settings['channel']:
                use_Wjets = False
            data = mt.multiclass_encoding(data, use_Wjets)
        loader.save_to_csv()
    print("\n============ Starting hyperparameter optimization ==========\n")
    swarm = pt.ParticleSwarm(pso_settings, st.get_fitness_score,
                             hyperparameter_info, to_continue, output_dir)
    optimal_hyperparameters = swarm.particleSwarmOptimization()[0]
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(output_dir,
                                        'best_hyperparameters.json')
    ut.save_dict_to_json(optimal_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))
def main():
    cmssw_path = os.path.expandvars('$CMSSW_BASE')
    package_dir = os.path.join(cmssw_path,
                               'src/machineLearning/machineLearning')
    settings_dir = os.path.join(package_dir, 'settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    modes = ['nonres/default', 'spin0', 'spin2']
    table_infos = []
    output_file = os.path.expandvars(
        os.path.join(global_settings['output_dir'], 'EventYield.tex'))
    for mode in modes:
        global_settings['scenario'] = mode
        channel_dir = os.path.join(package_dir, 'info', 'HH',
                                   global_settings['channel'])
        reader = hpr.HHParameterReader(channel_dir, mode)
        preferences = reader.parameters
        normalizer = hht.HHDataNormalizer
        loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        mode_data = loader.data
        for era in set(mode_data['era']):
            era_data = mode_data.loc[mode_data['era'] == era]
            channel = global_settings['channel']
            table_creator = eyc.EventYieldTable(era_data, channel, era, mode)
            table_info = table_creator.create_table()
            table_infos.append(table_info)
    table_writer = eyc.EventYieldsFile(table_infos, output_file)
    table_writer.fill_document_file()
    print('File saved to %s' % output_file)
def choose_trainvar(datacard_dir, channel, trainvar, bdt_type):
    '''Reads the training variables from the data folder from the file
    'optimization_trainvars.txt'. Is used for the xgb_tth cf function.

    Parameters:
    ----------
    datacard_dir : dummy argument
        Needed for compatibility with the other trainvars loading
    channel : dummy argument
        Needed for compatibility with the other trainvars loading
    trainvar : dummy argument
        Needed for compatibility with the other trainvars loading
    bdt_type : dummy argument
        Needed for compatibility with the other trainvars loading

    Returns:
    -------
    trainvars : list
        List of trainvars that are to be used in the optimization.
    '''
    settings_dir = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    out_dir = os.path.expandvars(global_settings['output_dir'])
    trainvars_path = os.path.join(out_dir, 'optimization_trainvars.txt')
    try:
        trainvars = dlt.read_list(trainvars_path)
    except IOError:
        print('Could not find trainvars')
        trainvars = []
    return trainvars
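# A minimal sketch of a read_list-style helper, assuming
# 'optimization_trainvars.txt' stores one training variable per line.
# The real dlt.read_list implementation is not shown in this listing.
def read_list(path):
    """Reads a text file into a list of stripped, non-empty lines."""
    with open(path, 'r') as list_file:
        return [line.strip() for line in list_file if line.strip()]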
Example #4
def main(output_dir, save_model, channel, mode, era, BM):
    settings_dir = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    global_settings['ml_method'] = 'lbn'
    # NOTE: this overrides the 'channel' argument of the function
    global_settings['channel'] = 'bb1l'
    if output_dir == 'None':
        # NOTE: 'res_nonres' is not an argument of this function; it is
        # assumed to be defined at module level (e.g. parsed from the CLI)
        output_dir = os.path.join(
            global_settings['channel'], global_settings['ml_method'],
            res_nonres, mode, era)
    global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if BM != 'None':
        preferences["nonResScenarios"] = [BM]
    print('BM point to be considered: ' + str(preferences["nonResScenarios"]))
    if era != '0':
        preferences['included_eras'] = [era.replace('20', '')]
    print('era: ' + str(preferences['included_eras']))
    preferences = define_trainvars(global_settings, preferences, info_dir)
    particles = PARTICLE_INFO[global_settings['channel']]
    data_dict = create_data_dict(preferences, global_settings)
    classes = set(data_dict["even_data"]["process"])
    # Print the multitarget label assigned to each process class
    for class_ in classes:
        multitarget = list(
            set(data_dict["even_data"].loc[data_dict["even_data"]["process"] ==
                                           class_, "multitarget"]))[0]
        print(str(class_) + '\t' + str(multitarget))
    even_model = create_model(preferences, global_settings, data_dict,
                              "even_data", save_model)
    if global_settings['feature_importance'] == 1:
        trainvars = preferences['trainvars']
        data = data_dict['odd_data']
        lbn_feature_importance = nt.LBNFeatureImportances(
            even_model, data, trainvars, global_settings['channel'])
        score_dict = lbn_feature_importance.custom_permutation_importance()
        hhvt.plot_feature_importances_from_dict(score_dict,
                                                global_settings['output_dir'])
    odd_model = create_model(preferences, global_settings, data_dict,
                             "odd_data", save_model)
    print(odd_model.summary())
    nodewise_performance(
        data_dict['odd_data'], data_dict['even_data'], odd_model, even_model,
        data_dict['trainvars'], particles, global_settings, preferences)
    even_train_info, even_test_info = evaluate_model(
        even_model, data_dict['even_data'], data_dict['odd_data'],
        data_dict['trainvars'], global_settings, "even_data", particles)
    odd_train_info, odd_test_info = evaluate_model(
        odd_model, data_dict['odd_data'], data_dict['even_data'],
        data_dict['trainvars'], global_settings, "odd_data", particles)
    hhvt.plotROC([odd_train_info, odd_test_info],
                 [even_train_info, even_test_info], global_settings)
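# A minimal sketch of the permutation-importance idea behind
# custom_permutation_importance above: shuffle one input column at a time
# and record how much the model score drops. The score_fn callback and its
# signature are assumptions, not the real nt.LBNFeatureImportances API.
import numpy as np

def permutation_importances(score_fn, data, trainvars):
    """Returns {trainvar: baseline score minus score with column shuffled}."""
    baseline = score_fn(data)
    importances = {}
    for trainvar in trainvars:
        shuffled = data.copy()
        shuffled[trainvar] = np.random.permutation(shuffled[trainvar].values)
        importances[trainvar] = baseline - score_fn(shuffled)
    return importances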
Example #5
def main():
    settings_dir = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    output_dir = os.path.expandvars(global_settings['output_dir'])
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    ut.save_run_settings(output_dir)
    print("::::::: Reading parameters :::::::")
    param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    value_dicts = ut.read_parameters(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    hyperparameter_sets = xt.prepare_run_params(value_dicts,
                                                pso_settings['sample_size'])
    print("\n============ Starting hyperparameter optimization ==========\n")
    best_hyperparameters = pt.run_pso(value_dicts, st.get_fitness_score,
                                      hyperparameter_sets, output_dir)
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(output_dir,
                                        'best_hyperparameters.json')
    ut.save_dict_to_json(best_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))
def initialize_trainvars(channel='2l_2tau', process='HH', random_sample='TTZ'):
    '''Reads in all the possible trainvars for the initial run

    Parameters:
    ----------
    channel : str
        Name of the channel the .root file is taken from (e.g. 2l_2tau)
    process : str
        Name of the process for which the .root file is loaded (e.g. ttH or HH)
    random_sample : str
        A random sample that the .root file to be loaded belongs to

    Returns:
    -------
    trainvars : list
        List of all possible trainvars that are to be used in the optimization
    '''
    info_folder = os.path.join(os.path.expandvars('$CMSSW_BASE'),
                               'src/machineLearning/machineLearning/info')
    inputpath_info_path = os.path.join(info_folder, process, channel,
                                       'tauID_training.json')
    info_dict = ut.read_parameters(inputpath_info_path)[1]
    path_to_files = info_dict['inputPath']
    wildcard_root_files = os.path.join(path_to_files,
                                       '*' + random_sample + '*', 'central',
                                       '*.root')
    single_root_file = glob.glob(wildcard_root_files)[0]
    channel_info_path = os.path.join(info_folder, process, channel,
                                     'info.json')
    channel_info_dict = ut.read_multiline_json_to_dict(channel_info_path)
    channel_in_tree = channel_info_dict['channelInTree']
    samplename_info_path = os.path.join(info_folder, 'samplename_info.json')
    settings_dir = os.path.join(os.path.expandvars('$CMSSW_BASE'),
                                'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    samplename_info = ut.read_parameters(samplename_info_path)
    folder_name = random_sample
    sample_dict = dlt.find_sample(folder_name, samplename_info)
    if sample_dict == {}:
        sample_dict = dl.advanced_sample_name(
            global_settings['bdtType'], folder_name,
            [])  # TTZ is just a random choice
    sample_name = sample_dict['sampleName']
    input_tree = str(
        os.path.join(channel_in_tree, 'sel/evtntuple', sample_name, 'evtTree'))
    trainvars = access_ttree(single_root_file, input_tree)
    trainvars = data_related_trainvars(trainvars)
    return trainvars
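# A minimal sketch of an access_ttree-style helper using uproot (an
# assumption -- the original may well use PyROOT instead). It lists the
# branch names of the given TTree, which serve as the candidate trainvars.
import uproot

def access_ttree(root_file_path, tree_path):
    """Returns the branch names of a TTree inside a .root file."""
    with uproot.open(root_file_path) as root_file:
        return list(root_file[tree_path].keys())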
Example #7
def get_fitness_score(hyperparameter_sets, global_settings, sample_size=0):
    '''The main function call that is the SLURM equivalent of
    ensemble_fitness in xgb_tools

    Parameters:
    ----------
    hyperparameter_sets : list of dicts
        Parameter-sets for all particles
    global_settings : dict
        Global settings for the hyperparameter optimization
    sample_size : int
        Sample size, in case it does not correspond to the value given in
        the settings file

    Returns:
    -------
    scores : list of floats
        Fitness of each hyperparameter-set
    '''
    output_dir = os.path.expandvars(global_settings['output_dir'])
    previous_files_dir = os.path.join(output_dir, 'previous_files')
    if not os.path.exists(previous_files_dir):
        os.makedirs(previous_files_dir)
    settings_dir = os.path.join(output_dir, 'run_settings')
    if sample_size == 0:
        opt_settings = ut.read_settings(settings_dir,
                                        global_settings['optimization_algo'])
        sample_size = opt_settings['sample_size']
    parameters_to_file(output_dir, hyperparameter_sets)
    wild_card_path = os.path.join(output_dir, 'samples', '*',
                                  'parameters.json')
    zero_sized = 1
    while zero_sized != 0:
        # Poll until every parameters.json has been fully written to disk
        zero_sized = check_parameter_file_sizes(wild_card_path)
        time.sleep(2)
    for parameter_file in glob.glob(wild_card_path):
        sample_nr = get_sample_nr(parameter_file)
        job_file = prepare_job_file(parameter_file, sample_nr, global_settings)
        subprocess.call(['sbatch', job_file])
    wait_iteration(output_dir, sample_size)
    time.sleep(30)
    scores = read_fitness(output_dir, global_settings['fitness_fn'])
    move_previous_files(output_dir, previous_files_dir)
    return scores
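# A minimal sketch of what check_parameter_file_sizes could do, based on the
# polling loop above: return how many of the parameter files matched by the
# wildcard are still empty, i.e. not yet fully written by the batch jobs.
import glob
import os

def check_parameter_file_sizes(wild_card_path):
    """Counts zero-sized files among those matching the wildcard path."""
    return sum(1 for path in glob.glob(wild_card_path)
               if os.path.getsize(path) == 0)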
Example #8
def main(hyperparameter_file, output_dir):
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    nthread = global_settings['nthread']
    path = Path(hyperparameter_file)
    save_dir = str(path.parent)
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    global_settings['debug'] = False
    data_file = os.path.join(output_dir, 'data.csv')
    data = pandas.read_csv(data_file)
    if bool(global_settings['use_kfold']):
        score, train, test = et.kfold_cv(
            xt.model_evaluation_main,
            data,
            preferences['trainvars'],
            global_settings,
            hyperparameters
        )
    else:
        score, train, test = et.get_evaluation(
            xt.model_evaluation_main,
            data,
            preferences['trainvars'],
            global_settings,
            hyperparameters
        )
    score_path = os.path.join(save_dir, 'score.json')
    score_dict = {
        global_settings['fitness_fn']: score,
        'train': train,
        'test': test
    }
    with open(score_path, 'w') as score_file:
        json.dump(score_dict, score_file)
Example #9
def main(output_dir, settings_dir, hyperparameter_file, debug):
    if settings_dir == 'None':
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    if output_dir == 'None':
        output_dir = global_settings['output_dir']
    else:
        global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if hyperparameter_file == 'None':
        hyperparameter_file = os.path.join(info_dir, 'hyperparameters.json')
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    evaluation_main(global_settings, preferences, hyperparameters, debug)
Example #10
def main(output_dir, settings_dir, hyperparameter_file, debug):
    if settings_dir == 'None':
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
    # NOTE: 'channel', 'mode', 'res_nonres', 'era' and 'BM' are not arguments
    # of this function; they are assumed to be defined at module level
    # (e.g. parsed from the CLI)
    global_settings_path = os.path.join(
        settings_dir,
        'global_%s_%s_%s_settings.json' % (channel, mode, res_nonres))
    command = 'rsync %s ~/machineLearning/CMSSW_11_2_0_pre1/src/machineLearning/machineLearning/settings/global_settings.json' % global_settings_path
    p = subprocess.Popen(command,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    p.communicate()  # wait for the rsync to finish before reading settings
    global_settings = ut.read_settings(settings_dir, 'global')
    if output_dir == 'None':
        output_dir = os.path.join(
            global_settings['channel'], global_settings['ml_method'],
            res_nonres, mode, era)
    global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if BM != 'None':
        preferences["nonResScenarios"] = [BM]
    print('BM point to be considered: ' + str(preferences["nonResScenarios"]))
    if era != '0':
        preferences['included_eras'] = [era.replace('20', '')]
    print('era: ' + str(preferences['included_eras']))
    preferences = define_trainvars(global_settings, preferences, info_dir)
    if hyperparameter_file == 'None':
        hyperparameter_file = os.path.join(info_dir, 'hyperparameters.json')
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    print('hyperparameters: ' + str(hyperparameters))
    evaluation_main(global_settings, preferences, hyperparameters, debug)
Example #11
def main(hyperparameter_file, output_dir):
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    num_classes = global_settings['num_classes']
    nthread = global_settings['nthread']
    path = Path(hyperparameter_file)
    save_dir = str(path.parent)
    hyperparameters = ut.read_parameters(hyperparameter_file)[0]
    preferences = dlt.get_parameters(global_settings['process'],
                                     global_settings['channel'],
                                     global_settings['bkg_mass_rand'],
                                     global_settings['tauID_training'])
    data = dlt.load_data(
        preferences['inputPath'],
        preferences['channelInTree'],
        preferences['trainvars'],
        global_settings['bdtType'],
        global_settings['channel'],
        preferences['keys'],
        preferences['masses'],
        global_settings['bkg_mass_rand'],
    )
    dlt.reweigh_dataframe(data, preferences['weight_dir'],
                          preferences['trainvars'], ['gen_mHH'],
                          preferences['masses'])
    normalize_hh_dataframe(data, preferences, global_settings)
    if bool(global_settings['use_kfold']):
        score = et.kfold_cv(xt.model_evaluation_main, data,
                            preferences['trainvars'], global_settings,
                            hyperparameters)
    else:
        score, pred_train, pred_test = et.get_evaluation(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
        st.save_prediction_files(pred_train, pred_test, save_dir)
    score_path = os.path.join(save_dir, 'score.json')
    with open(score_path, 'w') as score_file:
        json.dump({global_settings['fitness_fn']: score}, score_file)
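# A minimal sketch of a kfold_cv-style evaluation using
# sklearn.model_selection.KFold. The real et.kfold_cv is not shown in this
# listing, so the callback signature and the averaging of per-fold scores
# are assumptions based on how it is called above.
import numpy as np
from sklearn.model_selection import KFold

def kfold_cv(evaluate, data, trainvars, global_settings, hyperparameters,
             n_splits=5):
    """Averages the evaluation score over n_splits train/test folds."""
    scores = []
    for train_idx, test_idx in KFold(n_splits=n_splits,
                                     shuffle=True).split(data):
        train = data.iloc[train_idx]
        test = data.iloc[test_idx]
        scores.append(evaluate(hyperparameters, train, test, trainvars,
                               global_settings))
    return np.mean(scores)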
Example #12
def test_read_settings():
    # 'settings_dir' is assumed to be defined at module scope in the test file
    pso_settings = ut.read_settings(settings_dir, 'pso')
    global_settings = ut.read_settings(settings_dir, 'global')
    assert len(pso_settings.keys()) == 7
    assert len(global_settings.keys()) == 14
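# A minimal sketch of a read_settings-style helper, assuming each settings
# group lives in a '<group>_settings.json' file inside settings_dir (as the
# 'global_settings.json' file name used elsewhere in this listing suggests).
import json
import os

def read_settings(settings_dir, group):
    """Loads '<group>_settings.json' from settings_dir into a dict."""
    settings_path = os.path.join(settings_dir, '%s_settings.json' % group)
    with open(settings_path, 'r') as settings_file:
        return json.load(settings_file)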
Example #13
def run_pso(value_dicts, calculate_fitnesses, hyperparameter_sets, output_dir):
    '''Performs the whole particle swarm optimization. Note that the best
    fitness is the maximum value, not the minimum (multiply by -1 if needed).

    Parameters:
    ----------
    value_dicts : list of dicts
        Info about every variable that is to be optimized
    calculate_fitnesses : method
        Function that calculates the fitnesses and returns the scores
    hyperparameter_sets : list of dicts
        The parameter-sets of all particles
    output_dir : str
        Path to the output directory

    Returns:
    -------
    best_hyperparameters : dict
        Best hyperparameters found.
    '''
    print(':::::::: Initializing :::::::::')
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    pso_settings = ut.read_settings(settings_dir, 'pso')
    inertial_weight, inertial_weight_step = get_weight_step(pso_settings)
    iteration = 1
    new_hyperparameter_sets = hyperparameter_sets
    compactness = et.calculate_compactness(hyperparameter_sets)
    fitnesses = calculate_fitnesses(hyperparameter_sets, global_settings)
    personal_bests = hyperparameter_sets
    best_fitnesses = fitnesses
    index = np.argmax(fitnesses)
    best_hyperparameters = hyperparameter_sets[index]
    best_fitness = fitnesses[index]
    current_speeds = initialize_speeds(hyperparameter_sets)
    max_iterations_not_reached = True
    not_clustered = True
    while max_iterations_not_reached and not_clustered:
        print('::::::: Iteration: ' + str(iteration) + ' ::::::::')
        hyperparameter_sets = new_hyperparameter_sets
        compactness = et.calculate_compactness(hyperparameter_sets)
        print(' --- Compactness: ' + str(compactness) + ' ---')
        fitnesses = calculate_fitnesses(hyperparameter_sets, global_settings)
        best_fitnesses = find_best_fitness(fitnesses, best_fitnesses)
        personal_bests = calculate_personal_bests(fitnesses, best_fitnesses,
                                                  hyperparameter_sets,
                                                  personal_bests)
        weight_dict = {
            'c1': pso_settings['c1'],
            'c2': pso_settings['c2'],
            'w': inertial_weight
        }
        new_hyperparameter_sets, current_speeds = prepare_new_day(
            personal_bests, hyperparameter_sets, best_hyperparameters,
            current_speeds, value_dicts, weight_dict)
        index = np.argmax(fitnesses)
        if best_fitness < max(fitnesses):
            best_hyperparameters = hyperparameter_sets[index]
            best_fitness = fitnesses[index]
        inertial_weight += inertial_weight_step
        iteration += 1
        max_iterations_not_reached = iteration <= pso_settings['iterations']
        not_clustered = pso_settings['compactness_threshold'] < compactness
    return best_hyperparameters
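# A minimal sketch of the canonical PSO update that prepare_new_day is
# expected to perform for each particle (the real implementation, including
# the bounds handling driven by value_dicts, is not shown in this listing):
#   v <- w*v + c1*r1*(personal_best - x) + c2*r2*(global_best - x)
#   x <- x + v
import numpy as np

def update_particle(position, speed, personal_best, global_best, weight_dict):
    """One PSO step for a single particle's hyperparameter vector."""
    r1, r2 = np.random.rand(2)
    new_speed = (weight_dict['w'] * speed
                 + weight_dict['c1'] * r1 * (personal_best - position)
                 + weight_dict['c2'] * r2 * (global_best - position))
    return position + new_speed, new_speed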