Code example #1
0
def prepare_data(analysis):
    """Load the training data and configuration for the requested analysis.

    Args:
        analysis: Analysis type; either 'HHmultilepton' or 'HHbbWW'.

    Returns:
        Tuple ``(data, preferences, global_settings)`` where ``data`` is the
        DataFrame produced by the loader, ``preferences`` holds the channel
        parameters (including 'trainvars' and the loaded 'hyperparameters'),
        and ``global_settings`` is the global configuration dict.

    Raises:
        ValueError: If ``analysis`` is not a recognized analysis type.
    """
    channel_dir, info_dir, global_settings = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    preferences['trainvars'] = preferences['all_trainvar_info'].keys()
    startTime = datetime.now()
    print('data loading is started: ' + str(startTime))
    if analysis == 'HHmultilepton':
        normalizer = hht.HHDataNormalizer
        loader = hht.HHDataLoader(normalizer, preferences, global_settings)
    elif analysis == 'HHbbWW':
        normalizer = bbwwt.bbWWDataNormalizer
        loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
    else:
        # Fail fast with a clear message instead of a NameError on 'loader'.
        raise ValueError('Unknown analysis type: %s' % analysis)
    data = loader.data
    print('data loading is finished')
    print(datetime.now() - startTime)
    # Resonant scenarios live under a 'res/' subdirectory in the info tree.
    scenario = scenario if 'nonres' in scenario else 'res/' + scenario
    hyperparameters_file = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/info/',
        global_settings['process'], global_settings['channel'], scenario,
        'hyperparameters.json')
    with open(hyperparameters_file, 'rt') as in_file:
        preferences['hyperparameters'] = json.load(in_file)
    return data, preferences, global_settings
Code example #2
0
def main(to_continue, opt_dir, bbww):
    """Run (or resume) particle-swarm hyperparameter optimization.

    Reads the run settings, loads the training data (from the cached .csv
    if present, otherwise via the appropriate loader), runs the particle
    swarm over the hyperparameter space and saves the best hyperparameters
    as JSON into the output directory.

    Args:
        to_continue: If truthy, resume a previous run from ``opt_dir``
            (reusing its saved 'run_settings'); otherwise start a fresh
            run from the $CMSSW_BASE settings directory.
        opt_dir: Directory of the run to continue (used only when
            ``to_continue`` is truthy).
        bbww: If truthy, use the bbWW normalizer/loader; otherwise the
            HH multilepton ones.
    """
    if not to_continue:
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = os.path.expandvars(global_settings['output_dir'])
    else:
        # Resuming: reuse the settings snapshot saved with the earlier run.
        settings_dir = os.path.join(opt_dir, 'run_settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = opt_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not to_continue:
        # Fresh run: snapshot settings and info next to the results so the
        # run can be reproduced or continued later.
        if not os.path.exists(os.path.join(output_dir, 'run_settings')):
            ut.save_run_settings(output_dir)
        if not os.path.exists(os.path.join(output_dir, 'run_info')):
            ut.save_info_dir(output_dir)
    # use_scratch_for_data(global_settings)
    print("::::::: Reading parameters :::::::")
    # The hyperparameter search space depends on the ML method in use.
    if global_settings['ml_method'] == 'xgb':
        param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    else:
        param_file = os.path.join(settings_dir, 'nn_parameters.json')
    hyperparameter_info = ut.read_json_cfg(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    pso_settings.update(global_settings)
    # NOTE(review): 'addition' and 'info_dir' are computed but never used
    # below — candidates for removal; confirm create_infoPath_addition has
    # no required side effects before deleting.
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    normalizer = hht.HHDataNormalizer if not bbww else bbwwt.bbWWDataNormalizer
    if os.path.exists(preferences['data_csv']):
        # Cached data from a previous run — skip the expensive loading step.
        print(':::::::: Loading data from .csv file ::::::::')
        data = pandas.read_csv(preferences['data_csv'])
    else:
        print('::::::: Loading data to be saved to pandas.DataFrame :::::::')
        if not bbww:
            loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        else:
            loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
        data = loader.data
        if global_settings['ml_method'] in ['lbn', 'nn']:
            # Multiclass encoding is only applied for NN-style methods; the
            # bb2l channel has no W+jets class.
            use_Wjets = True
            if 'bb2l' in global_settings['channel']:
                use_Wjets = False
            data = mt.multiclass_encoding(data, use_Wjets)
        loader.save_to_csv()
    print("\n============ Starting hyperparameter optimization ==========\n")
    swarm = pt.ParticleSwarm(pso_settings, st.get_fitness_score,
                             hyperparameter_info, to_continue, output_dir)
    optimal_hyperparameters = swarm.particleSwarmOptimization()[0]
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(output_dir,
                                        'best_hyperparameters.json')
    ut.save_dict_to_json(optimal_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))
Code example #3
0
def split_data(global_settings, preferences):
    """Split the dataset into even- and odd-event-number halves.

    Loads the data from the cached .csv when available, otherwise via the
    bbWW loader, produces diagnostic plots, and splits on the parity of the
    'event' column. For the 3l_1tau channel a fixed set of keys is kept in
    both halves instead of being split.

    Args:
        global_settings: Global configuration dict ('channel', 'output_dir').
        preferences: Channel preferences dict ('data_csv', 'trainvars').

    Returns:
        Tuple ``(even_data, odd_data)`` of pandas DataFrames.
    """
    print('============ Starting evaluation ============')
    if os.path.exists(preferences['data_csv']):
        data = pandas.read_csv(preferences['data_csv'])
    else:
        loader = bbwwt.bbWWLoader(
            bbwwt.bbWWDataNormalizer, preferences, global_settings)
        data = loader.data
    hhvt.plot_trainvar_multi_distributions(
        data, preferences['trainvars'], global_settings['output_dir'])
    hhvt.plot_correlations(data, preferences['trainvars'], global_settings)
    unsplit_keys = []
    if '3l_1tau' in global_settings['channel']:
        unsplit_keys = ['WZ', 'DY', 'TTTo']
        print('These keys are excluded from splitting: ', unsplit_keys)
    keep_in_both = data['key'].isin(unsplit_keys)
    is_even = data['event'].values % 2 == 0
    even_data = data.loc[np.logical_or(is_even, keep_in_both)]
    odd_data = data.loc[np.logical_or(~is_even, keep_in_both)]
    return even_data, odd_data
Code example #4
0
def create_data_dict(preferences, global_settings):
    """Load the bbWW dataset, report class composition and build a data dict.

    Loads the data via the bbWW loader, produces diagnostic plots, prints
    the relative total-weight fraction of each background process and the
    signal, applies multiclass target encoding, and splits the data into
    even/odd event-number halves.

    Args:
        preferences: Channel preferences dict (provides 'trainvars').
        global_settings: Global configuration dict ('channel', 'output_dir').

    Returns:
        Dict with keys 'trainvars', 'odd_data' and 'even_data'.
    """
    normalizer = bbwwt.bbWWDataNormalizer
    loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
    data = loader.data

    hhvt.plot_single_mode_correlation(data, preferences['trainvars'],
                                      global_settings['output_dir'],
                                      'trainvar')
    hhvt.plot_trainvar_multi_distributions(data, preferences['trainvars'],
                                           global_settings['output_dir'])
    # Total weight per background process plus the signal (target == 1);
    # summed left-to-right in the same order as the printed header so the
    # float accumulation matches the previous hand-written chain.
    processes = ['TT', 'W', 'DY', 'ST', 'Other']
    process_sums = [
        data.loc[data['process'] == process]['totalWeight'].sum()
        for process in processes
    ]
    signal_sum = data.loc[data['target'] == 1]['totalWeight'].sum()
    sumall = sum(process_sums) + signal_sum
    fractions = [weight / sumall for weight in process_sums + [signal_sum]]
    print(
        'TT:W:DY:ST:Other:HH \t'
        + ':'.join('%0.3f' % fraction for fraction in fractions)
    )
    # The bb2l channel has no W+jets class in the multiclass encoding.
    use_Wjet = 'bb2l' not in global_settings['channel']
    data = mt.multiclass_encoding(data, use_Wjet)
    hhvt.plot_correlations(data, preferences['trainvars'], global_settings)
    even_mask = data['event'].values % 2 == 0
    even_data = data.loc[even_mask]
    odd_data = data.loc[~even_mask]
    data_dict = {
        'trainvars': preferences['trainvars'],
        'odd_data': odd_data,
        'even_data': even_data
    }
    return data_dict