import os
import json
from datetime import datetime

import numpy as np
import pandas

# Project-internal modules. The aliases (ut, hpr, hht, bbwwt, mt, pt, st,
# hhvt) are used throughout this file; the import paths below are assumed
# from the '$CMSSW_BASE/src/machineLearning/machineLearning' layout referenced
# in the code and may need adjusting to the actual package structure.
from machineLearning.machineLearning import universal_tools as ut
from machineLearning.machineLearning import hh_parameter_reader as hpr
from machineLearning.machineLearning import hh_tools as hht
from machineLearning.machineLearning import bbWW_tools as bbwwt
from machineLearning.machineLearning import multiclass_tools as mt
from machineLearning.machineLearning import pso_tools as pt
from machineLearning.machineLearning import slurm_tools as st
from machineLearning.machineLearning import hh_visualization_tools as hhvt


def prepare_data(analysis):
    """Load the data and hyperparameters for the requested analysis.

    Args:
        analysis: either 'HHmultilepton' or 'HHbbWW'.

    Returns:
        (data, preferences, global_settings) tuple.
    """
    channel_dir, info_dir, global_settings = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    preferences['trainvars'] = list(preferences['all_trainvar_info'].keys())
    startTime = datetime.now()
    print('Data loading started: ' + str(startTime))
    if analysis == 'HHmultilepton':
        normalizer = hht.HHDataNormalizer
        loader = hht.HHDataLoader(normalizer, preferences, global_settings)
    elif analysis == 'HHbbWW':
        normalizer = bbwwt.bbWWDataNormalizer
        loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
    else:
        raise ValueError('Unknown analysis: %s' % analysis)
    data = loader.data
    print('Data loading finished in ' + str(datetime.now() - startTime))
    scenario = scenario if 'nonres' in scenario else 'res/' + scenario
    hyperparameters_file = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/info',
        global_settings['process'],
        global_settings['channel'],
        scenario,
        'hyperparameters.json')
    with open(hyperparameters_file, 'rt') as in_file:
        preferences['hyperparameters'] = json.load(in_file)
    return data, preferences, global_settings

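# A minimal usage sketch for prepare_data(); the calling context is assumed,
# not part of this module:
#
#   data, preferences, global_settings = prepare_data('HHbbWW')
#   print('%d events, %d training variables' % (
#       len(data), len(preferences['trainvars'])))
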
def main(to_continue, opt_dir, bbww):
    """Run (or resume) the particle-swarm hyperparameter optimization.

    Args:
        to_continue: resume a previous optimization found in opt_dir.
        opt_dir: output directory of the run to be continued.
        bbww: use the HH->bbWW loader/normalizer instead of multilepton.
    """
    if not to_continue:
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = os.path.expandvars(global_settings['output_dir'])
    else:
        settings_dir = os.path.join(opt_dir, 'run_settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = opt_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not to_continue:
        if not os.path.exists(os.path.join(output_dir, 'run_settings')):
            ut.save_run_settings(output_dir)
        if not os.path.exists(os.path.join(output_dir, 'run_info')):
            ut.save_info_dir(output_dir)
    # use_scratch_for_data(global_settings)
    print("::::::: Reading parameters :::::::")
    if global_settings['ml_method'] == 'xgb':
        param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    else:
        param_file = os.path.join(settings_dir, 'nn_parameters.json')
    hyperparameter_info = ut.read_json_cfg(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    pso_settings.update(global_settings)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    normalizer = hht.HHDataNormalizer if not bbww else bbwwt.bbWWDataNormalizer
    if os.path.exists(preferences['data_csv']):
        print(':::::::: Loading data from .csv file ::::::::')
        data = pandas.read_csv(preferences['data_csv'])
    else:
        print('::::::: Loading data to be saved to pandas.DataFrame :::::::')
        if not bbww:
            loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        else:
            loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
        data = loader.data
        if global_settings['ml_method'] in ['lbn', 'nn']:
            use_Wjets = 'bb2l' not in global_settings['channel']
            data = mt.multiclass_encoding(data, use_Wjets)
        loader.save_to_csv()
    print("\n============ Starting hyperparameter optimization ==========\n")
    swarm = pt.ParticleSwarm(
        pso_settings, st.get_fitness_score, hyperparameter_info,
        to_continue, output_dir)
    optimal_hyperparameters = swarm.particleSwarmOptimization()[0]
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(
        output_dir, 'best_hyperparameters.json')
    ut.save_dict_to_json(optimal_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))

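# An illustrative entry point for main(); the package's actual command-line
# handling is not shown in this file, so the argparse flags below are
# assumptions:
#
#   if __name__ == '__main__':
#       import argparse
#       parser = argparse.ArgumentParser()
#       parser.add_argument('--continue', dest='to_continue',
#                           action='store_true')
#       parser.add_argument('--opt_dir', default='')
#       parser.add_argument('--bbww', action='store_true')
#       args = parser.parse_args()
#       main(args.to_continue, args.opt_dir, args.bbww)
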
def split_data(global_settings, preferences):
    """Split the data into even and odd event-number halves.

    Samples listed in keysNotToSplit are kept in both halves.
    """
    print('============ Starting evaluation ============')
    if os.path.exists(preferences['data_csv']):
        data = pandas.read_csv(preferences['data_csv'])
    else:
        normalizer = bbwwt.bbWWDataNormalizer
        loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
        data = loader.data
    hhvt.plot_trainvar_multi_distributions(
        data, preferences['trainvars'], global_settings['output_dir'])
    hhvt.plot_correlations(data, preferences['trainvars'], global_settings)
    keysNotToSplit = []
    if '3l_1tau' in global_settings['channel']:
        keysNotToSplit = ['WZ', 'DY', 'TTTo']
        print('These keys are excluded from splitting: ', keysNotToSplit)
    evtNotToSplit = data['key'].isin(keysNotToSplit)
    evtEven = data['event'].values % 2 == 0
    evtOdd = ~evtEven
    even_data = data.loc[np.logical_or(evtEven, evtNotToSplit)]
    odd_data = data.loc[np.logical_or(evtOdd, evtNotToSplit)]
    return even_data, odd_data

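# Usage sketch for split_data(), assuming prepare_data() above supplied the
# configuration; the cross-evaluation comment describes the usual even/odd
# training scheme, not code in this module:
#
#   data, preferences, global_settings = prepare_data('HHbbWW')
#   even_data, odd_data = split_data(global_settings, preferences)
#   # Train on even_data, evaluate on odd_data (and vice versa), so each
#   # event is used for evaluation exactly once.
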
def create_data_dict(preferences, global_settings):
    """Load the bbWW data, print the per-process weight fractions and
    return the even/odd split together with the training variables."""
    normalizer = bbwwt.bbWWDataNormalizer
    loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
    data = loader.data
    hhvt.plot_single_mode_correlation(
        data, preferences['trainvars'], global_settings['output_dir'],
        'trainvar')
    hhvt.plot_trainvar_multi_distributions(
        data, preferences['trainvars'], global_settings['output_dir'])
    # Relative contribution of each process to the total event weight;
    # the signal (HH) is identified by target == 1.
    processes = ['TT', 'W', 'DY', 'ST', 'Other']
    weight_sums = {
        process: data.loc[data['process'] == process]['totalWeight'].sum()
        for process in processes
    }
    weight_sums['HH'] = data.loc[data['target'] == 1]['totalWeight'].sum()
    sumall = sum(weight_sums.values())
    print('TT:W:DY:ST:Other:HH \t' + ':'.join(
        '%0.3f' % (weight_sums[key] / sumall)
        for key in processes + ['HH']))
    use_Wjet = 'bb2l' not in global_settings['channel']
    data = mt.multiclass_encoding(data, use_Wjet)
    hhvt.plot_correlations(data, preferences['trainvars'], global_settings)
    even_data = data.loc[data['event'].values % 2 == 0]
    odd_data = data.loc[~(data['event'].values % 2 == 0)]
    data_dict = {
        'trainvars': preferences['trainvars'],
        'odd_data': odd_data,
        'even_data': even_data
    }
    return data_dict

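# Usage sketch for create_data_dict(); train_model() is hypothetical and only
# illustrates how the returned dictionary is meant to be consumed:
#
#   data_dict = create_data_dict(preferences, global_settings)
#   model = train_model(data_dict['even_data'][data_dict['trainvars']],
#                       data_dict['even_data']['target'])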