def main():
    """Augment a fixed list of signal samples with the trained discriminants.

    Command-line arguments (exactly four):
        1. MC_dir        -- directory passed to ``augment_file`` as input location
        2. setting_dir   -- directory prefix containing ``settings.conf``
                            (concatenated as-is, so it must end with a path separator)
        3. training_path -- passed as ``weightpath`` when loading the model collections
        4. data_outpath  -- output location passed to ``augment_file``

    Exits with status 1 if the number of arguments is wrong.
    """
    if len(sys.argv) != 5:
        print("Error: exactly 4 arguments are required")
        # abort here: continuing would either hit an IndexError below or run
        # with arguments that do not mean what the caller intended
        sys.exit(1)

    MC_dir = sys.argv[1]
    setting_dir = sys.argv[2]
    training_path = sys.argv[3]
    data_outpath = sys.argv[4]

    # files to which this discriminant should be augmented
    # (an extended list used previously also contained "ZZTo4l" and the
    #  "ggTo*_Contin_MCFM701" samples)
    data_files = [
        "ggH125",
        "VBFH125",
        "ZH125",
        "WplusH125",
        "WminusH125",
        "ttH125",
    ]

    # restore the model collections described by the settings file
    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = confhandler.GetModelCollection(weightpath=training_path)

    for data_file in data_files:
        augment_file(MC_dir, data_outpath, data_file, mcolls)
def main():
    """Create the ``settings.conf`` of a new run inside a campaign directory.

    Command-line arguments (exactly four):
        1. campaign_dir           -- campaign directory; the run is created in ``<campaign_dir>/run``
        2. input_config_file      -- forwarded to the model factory
        3. hyperparam_config_file -- forwarded to the model factory
        4. workdir                -- working directory; ``<workdir>/trainval/`` is used as MC path

    Exits with status 1 if the number of arguments is wrong.
    """
    if len(sys.argv) != 5:
        print("Error: exactly 4 arguments is required")
        # abort here instead of failing later with an IndexError
        sys.exit(1)

    campaign_dir = sys.argv[1]
    input_config_file = sys.argv[2]
    hyperparam_config_file = sys.argv[3]
    workdir = sys.argv[4]

    mass_point = 125.0

    run_dir = os.path.join(campaign_dir, "run")
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    MC_path = os.path.join(workdir, "trainval/")

    # this always uses SimpleModel by default (more complicated models are
    # accessible through ConfigFileSweeper when performing a sweep of network
    # hyperparameters etc.); ModelFactoryFullMassRangeDynamicInclusive was
    # used here previously with the same arguments
    mcoll = SimpleModelFactoryDynamic.GenerateSimpleModelCollections(
        MC_path,
        input_config_file=input_config_file,
        hyperparam_config_file=hyperparam_config_file,
        mass_point=mass_point)

    # serialize the generated model collections into the run directory
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.ToConfiguration(mcoll)
    mconfhandler.save_configuration(os.path.join(run_dir, "settings.conf"))
def augment_config(mcoll, parent_dir, iterables):
    """Write a modified copy of a model-collection configuration.

    The configuration derived from ``mcoll`` is altered according to the
    current state of every sweep object in ``iterables`` (a dict whose values
    provide ``cur()``, ``auxs``, ``names``, ``parameters`` and
    ``to_strings()``) and is then stored as ``settings.conf`` in a
    sub-directory of ``parent_dir``.  The sub-directory name is the
    concatenation of the first string representation of each sweep;
    ``parent_dir`` is used as a plain string prefix.
    """
    def _unchanged(item):
        return item

    handler = ModelCollectionConfigFileHandler()
    handler.ToConfiguration(mcoll)

    name_fragments = []

    # augment the config object, given the values in the iterable dict
    for sweep in iterables.values():
        current_values = sweep.cur()
        behaviours = sweep.auxs

        # always the first one in a linked sweep determines the actual name of it
        name_fragments.append(sweep.to_strings()[0])

        # apply all the changes set forth in a (linked) sweep
        linked = zip(current_values, behaviours, sweep.names, sweep.parameters)
        for value, behaviour, section, parameter in linked:
            replace = behaviour == 'replace'
            append = behaviour == 'append'

            if isinstance(value, dict):
                if replace:
                    handler.SetDict(section, parameter, value, str)
                elif append:
                    handler.AddDictEntry(section, parameter, value, float, str)
                continue

            if isinstance(value, list):
                if replace:
                    handler.SetList(section, parameter, value, _unchanged)
                elif append:
                    handler.AddListEntry(section, parameter, value,
                                         _unchanged, _unchanged)

    outpath = parent_dir + "".join(name_fragments) + "/"

    if not os.path.exists(outpath):
        print("creating " + outpath)
        os.makedirs(outpath)

    handler.save_configuration(outpath + "settings.conf")
def main():
    """Augment a single input tree with the discriminants of a trained run.

    Command-line arguments (exactly four):
        1. in_folder  -- input location passed to ``augment_file``
        2. out_folder -- output location passed to ``augment_file``
        3. tree_name  -- name of the tree/file to augment
        4. run_dir    -- run directory holding ``settings.conf`` and ``training/``

    Exits with status 1 if the number of arguments is wrong.
    """
    if len(sys.argv) != 5:
        print("Error: exactly 4 arguments are required")
        # abort here instead of failing later with an IndexError
        sys.exit(1)

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    tree_name = sys.argv[3]
    run_dir = sys.argv[4]

    # restore the model collections, reading the weights from <run_dir>/training/
    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(os.path.join(run_dir, "settings.conf"))
    mcolls = confhandler.GetModelCollection(
        weightpath=os.path.join(run_dir, "training/"))

    augment_file(in_folder, out_folder, tree_name, mcolls)
def main():
    """Train every model collection described by a settings file.

    Command-line arguments (exactly two):
        1. setting_dir  -- directory prefix containing ``settings.conf``
                           (concatenated as-is, so it must end with a path separator)
        2. training_dir -- directory handed to the ``Trainer``

    Exits with status 1 if the number of arguments is wrong.
    """
    if len(sys.argv) != 3:
        print("Error: exactly 2 arguments are required")
        # abort here instead of failing later with an IndexError
        sys.exit(1)

    setting_dir = sys.argv[1]
    training_dir = sys.argv[2]

    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = confhandler.GetModelCollection()

    train = Trainer(training_dir)

    # plain SGD with momentum; an Adam optimizer (lr = 0.001) was tried here as well
    opt = optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-6)

    # the same optimizer object is used for all model collections
    for mcoll in mcolls:
        train.train(mcoll, optimizer=opt, MC_weighting=False)
def distribute_training_settings(run_path):
    """Split the settings of a run into one config file per model collection.

    Reads ``<run_path>settings.conf`` and, for every model collection found in
    it, writes a ``settings.conf`` containing only that collection to
    ``<run_path>settings_training/<collection name>/``.  ``run_path`` is used
    as a plain string prefix and therefore has to end with a path separator.
    """
    def _make_dir_if_missing(path):
        if not os.path.exists(path):
            os.makedirs(path)

    # load the configuration that is sitting there; it lists all the model
    # collections that need to be trained
    source = ModelCollectionConfigFileHandler()
    source.load_configuration(run_path + "settings.conf")
    collections = source.GetModelCollection()

    # folder holding the settings for the individual models and their training
    settings_root = run_path + "settings_training/"
    _make_dir_if_missing(settings_root)

    # one separate config file for each of the model collections
    for collection in collections:
        collection_dir = settings_root + collection.name + "/"
        _make_dir_if_missing(collection_dir)

        single = ModelCollectionConfigFileHandler()
        single.ToConfiguration([collection])
        single.save_configuration(collection_dir + "settings.conf")
def main():
    """Benchmark trained models on their validation data.

    Computes low-level performance metrics for each model of each model
    collection, i.e. the mean-squared error evaluated on the validation
    dataset of each discriminant.  Since the validation datasets are held
    constant, this is an easy way to directly compare different models.

    Command-line arguments:
        1. setting_dir  -- directory prefix containing ``settings.conf``
        2. training_dir -- passed as ``weightpath`` when loading the models
        3. out_dir      -- directory prefix in which ``model_benchmark.txt`` is written
    (directory prefixes are concatenated as-is and must end with a path separator)

    Exits with status 1 if fewer than three arguments are given.
    """
    def _compute_class_weights_lengths(gen, preprocessor, MC_weighting=False):
        """Return (H1_class_weight, H0_class_weight, H1_length, H0_length).

        The lengths are the numbers of entries that remain after running the
        full H1 / H0 collections of ``gen`` through ``preprocessor``.
        """
        # determine the actual size of the available dataset and adjust the
        # sample weights correspondingly
        H1_data = gen.H1_collection.get_data(Config.branches, 0.0, 1.0)
        H0_data = gen.H0_collection.get_data(Config.branches, 0.0, 1.0)

        # get_last_indices() refers to the most recent process() call, so the
        # order of the following four statements matters
        H1_length = len(list(preprocessor.process(H1_data).values())[0])
        H1_indices = preprocessor.get_last_indices()
        H0_length = len(list(preprocessor.process(H0_data).values())[0])
        H0_indices = preprocessor.get_last_indices()

        print("H1_length = " + str(H1_length))
        print("H0_length = " + str(H0_length))

        if MC_weighting:
            # if per-sample weighting is enabled, also set up the
            # normalization of the event weights (negative weights are
            # clipped to zero before summing)
            H1_weight_sum = np.sum(
                np.maximum(np.array(H1_data["training_weight"][H1_indices]), 0.0))
            H0_weight_sum = np.sum(
                np.maximum(np.array(H0_data["training_weight"][H0_indices]), 0.0))

            H1_class_weight = float(H0_length) / H1_weight_sum
            H0_class_weight = float(H1_length) / H0_weight_sum
        else:
            H1_class_weight = 1.0 + float(H0_length) / float(H1_length)
            H0_class_weight = 1.0 + float(H1_length) / float(H0_length)

        return H1_class_weight, H0_class_weight, H1_length, H0_length

    if len(sys.argv) < 4:
        print("Error: 3 arguments are required (setting_dir, training_dir, out_dir)")
        # abort with a readable message instead of a bare IndexError
        sys.exit(1)

    setting_dir = sys.argv[1]
    training_dir = sys.argv[2]
    out_dir = sys.argv[3]

    # first, need to read in the trained ModelCollection:
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = mconfhandler.GetModelCollection(weightpath=training_dir)

    confhandler = ConfigFileHandler()
    out_path = out_dir + "model_benchmark.txt"

    # for the evaluation, need to proceed in the same way as for training, but
    # evaluate the models on the validation data instead of training them on
    # the training data
    for mcoll in mcolls:
        models = mcoll.get_models()
        preprocessors = mcoll.get_preprocessors()

        for cur_model, cur_preprocessor in zip(models, preprocessors):
            val_gen = Generator(mcoll.H1_stream,
                                mcoll.H0_stream,
                                Config.branches,
                                preprocessor=cur_preprocessor,
                                chunks=1,
                                MC_weighting=False)
            val_gen.setup_validation_data()

            (val_H1_classweight, val_H0_classweight,
             H1_length, H0_length) = _compute_class_weights_lengths(
                 val_gen, cur_preprocessor, False)

            print(val_H1_classweight)
            print(val_H0_classweight)
            print(H1_length)
            print(H0_length)

            val_gen.set_H1_weight(val_H1_classweight)
            val_gen.set_H0_weight(val_H0_classweight)
            val_gen.set_minimum_length(0)

            # (re)compile so that evaluate_generator reports the MSE loss
            keras_model = cur_model.get_keras_model()
            keras_model.compile(optimizer=optimizers.Adam(),
                                loss="mean_squared_error",
                                metrics=["binary_accuracy"])
            res = keras_model.evaluate_generator(
                val_gen.preprocessed_generator(), steps=1)

            print("statistics for model " + cur_model.name)
            print(res)
            print(keras_model.metrics_names)

            # one section per model; res[0] is the loss value
            confhandler.new_section(cur_model.name)
            confhandler.set_field(cur_model.name, 'H0_val_length', str(H0_length))
            confhandler.set_field(cur_model.name, 'H1_val_length', str(H1_length))
            confhandler.set_field(cur_model.name, 'val_loss', str(res[0]))

    confhandler.save_configuration(out_path)
def main():
    """Assemble an "optimized" run from the best models of several campaigns.

    Every sub-directory of the given campaign directories whose name does not
    contain "bin" is treated as an input run.  The first run found serves as
    the reference from which the available model collections and models are
    taken; all other runs are expected to follow the same structure.  For each
    model, the run with the smallest loss (as returned by ``get_loss``) wins;
    its trained model is copied to ``<last argument>/optimized`` and its
    settings are merged into a new ``settings.conf`` there.  The
    hyperparameters of all winners are written to
    ``<last argument>/optimized/../hyperparameters.conf``.

    Exits with status 1 if no input run is found.
    """
    separator = "==================================================================="

    # runs to check for (good) models
    input_runs = []

    print(separator)
    print("looking for models in the following runs:")

    # NOTE(review): the slice ends at -2 although only sys.argv[-1] is read
    # below, i.e. the second-to-last argument is never used -- confirm that
    # this is intended before changing it.
    for campaign_dir in sys.argv[1:-2]:
        for run_dir in next(os.walk(campaign_dir))[1]:
            if "bin" not in run_dir:
                run_path = os.path.join(campaign_dir, run_dir)
                print(run_path)
                input_runs.append(run_path)

    print(separator)

    if not input_runs:
        # without a reference run nothing can be combined; stop before the
        # output directory gets created
        print("Error: no input runs found")
        sys.exit(1)

    # output training campaign, this will consist of a combination of the
    # models found in the campaigns listed above, in such a way that the
    # overall performance is optimized
    output_run = os.path.join(sys.argv[-1], "optimized")

    # where the configuration file for the hyperparameter settings should be stored
    hyperparam_output = os.path.join(output_run, "../hyperparameters.conf")

    os.makedirs(output_run)

    # load the available model names
    reference_run = input_runs[0]
    available_mcolls = next(os.walk(os.path.join(reference_run, "training")))[1]

    mcolls_winning = []

    for mcoll in available_mcolls:
        models = next(os.walk(os.path.join(reference_run, "training", mcoll)))[1]

        # load a representative version of the current model collection...
        mconfhandler = ModelCollectionConfigFileHandler()
        mconfhandler.load_configuration(
            os.path.join(reference_run, "settings_training", mcoll,
                         "settings.conf"))
        mcoll_template = mconfhandler.GetModelCollection()[0]

        # ... but strip away all the actual model components
        mcoll_template.model_dict = {}
        mcoll_template.preprocessor_dict = {}
        mcoll_template.settings_dict = {}

        for model in models:
            # compare this model across the different runs
            losses = [get_loss(run, mcoll, model) for run in input_runs]
            winning_run = input_runs[np.argmin(losses)]

            # copy the winning model into the output run
            shutil.copytree(
                os.path.join(winning_run, "training", mcoll, model),
                os.path.join(output_run, "training", mcoll, model))

            print("--------------------------------------------")
            print(" take " + model + " from " + winning_run)
            print("--------------------------------------------")

            # load the winning model to keep track of its settings
            mconfhandler = ModelCollectionConfigFileHandler()
            mconfhandler.load_configuration(
                os.path.join(winning_run, "settings_training", mcoll,
                             "settings.conf"))
            mcoll_winning = mconfhandler.GetModelCollection()[0]

            # then pull the winning model over into the template
            mcoll_template.add_model(mcoll_winning.preprocessor_dict[model],
                                     mcoll_winning.model_dict[model],
                                     mcoll_winning.settings_dict[model])

        mcolls_winning.append(mcoll_template)

    # now save the put-together config file also into the output run
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.ToConfiguration(mcolls_winning)
    mconfhandler.save_configuration(os.path.join(output_run, "settings.conf"))

    # now distribute again the training settings, as usual:
    distribute_training_settings(output_run + '/')

    # now create the hyperparameter config file for each model, taken from the winners
    hp_confhandler = ConfigFileHandler()

    for mcoll in mcolls_winning:
        for model_name, model in mcoll.model_dict.items():
            hp_confhandler.new_section(model_name)
            hp_confhandler.set_field(
                model_name, "hyperparameters",
                ConfigFileUtils.serialize_dict(model.hyperparameters, str))

    hp_confhandler.save_configuration(hyperparam_output)

    print(separator)
    print("hyperparameter configuration file written to " + hyperparam_output)
    print(separator)
def format_parameter_list(inlist):
    """Return the entries of ``inlist`` as a comma-separated, wrapped label.

    Entries containing "D_" are shown as "MELA".  A line break is inserted
    after an entry once the current line has grown beyond 20 characters.  No
    separator or line break is emitted after the last entry (previously a
    dangling "," could remain when the break fell after the last entry).
    """
    pieces = []
    linewidth = 0
    last = len(inlist) - 1

    for position, parameter in enumerate(inlist):
        label = "MELA" if "D_" in parameter else parameter

        if position == last:
            pieces.append(label)
            break

        entry = label + ", "
        pieces.append(entry)
        linewidth += len(entry)

        if linewidth > 20:
            pieces.append("\n")
            linewidth = 0

    return "".join(pieces)


def main():
    """Plot the Punzi purity ratio per category for every run of a campaign.

    Command-line arguments (exactly one):
        1. campaign_dir -- directory whose sub-directories (except those
                           containing "statistics" in their name) are runs

    For each run, the input parameters of a typical model are read from
    ``settings.conf`` and the per-category Punzi values from
    ``comp/Mor18_punzi_comp.conf``.  The resulting matrix is saved as
    ``<campaign_dir>/statistics/punzi.pdf``.

    Exits with status 1 if the number of arguments is wrong or no run is found.
    """
    if len(sys.argv) != 2:
        print("Error: exactly 1 argument is required")
        # abort here instead of failing later with an IndexError
        sys.exit(1)

    campaign_dir = sys.argv[1]

    bin_dir = {"Untagged": 0, "VBF1j": 1, "VBF2j": 2, "VHhadr": 3}
    # fixed category order, used for data selection and axis labels alike
    categories = list(bin_dir)

    frames = []

    for subdir in next(os.walk(campaign_dir))[1]:
        if "statistics" in subdir:
            continue

        run_dir = os.path.join(campaign_dir, subdir)
        punzi_path = os.path.join(run_dir, "comp", "Mor18_punzi_comp.conf")
        settings_path = os.path.join(run_dir, "settings.conf")

        # first, read back the configuration file for this run
        conf = ModelCollectionConfigFileHandler()
        conf.LoadConfiguration(settings_path)

        # now select a typical model and read its hyperparameters
        typical_model = conf._get_model_list(
            conf._get_model_collection_list()[0])[0]
        hyperparams = conf.GetHyperparameters(typical_model)
        hyperparam_dict = {key: [val] for key, val in hyperparams.items()}

        # also read the list of input parameters that have been fed into the network
        param_list = conf.GetInputParameterList(typical_model)
        values = {
            'input_columns': [format_parameter_list(param_list)],
            'number_inputs': len(param_list),
        }

        # then read in the results in terms of relative Punzi improvement
        # for each category
        conf = ConfigFileHandler()
        conf.LoadConfiguration(punzi_path)
        for category in categories:
            values[category] = float(conf._get_field("Punzi", category))

        # merge the two dictionaries
        values.update(hyperparam_dict)

        frames.append(pd.DataFrame.from_dict(values))

    if not frames:
        print("Error: no runs found in " + campaign_dir)
        sys.exit(1)

    # a single concatenation instead of growing the frame inside the loop
    df = pd.concat(frames)

    statistics_dir = os.path.join(campaign_dir, "statistics")
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)

    # rows: categories, columns: runs
    punzi_data = np.transpose(df[categories].values)
    inparam_labels = df['input_columns'].values

    plt.figure(figsize=(8, 9))
    plt.imshow(punzi_data, interpolation='none', cmap='RdYlGn',
               aspect=0.6, vmin=0.8, vmax=1.2)
    plt.colorbar()
    plt.yticks(range(len(bin_dir)), categories)
    plt.xticks(range(len(df)), inparam_labels, rotation='vertical')
    plt.title("Punzi purity ratio")
    plt.savefig(os.path.join(statistics_dir, "punzi.pdf"), bbox_inches='tight')
def main():
    """Plot the Punzi purity ratio versus network depth for a sweep campaign.

    Command-line arguments (exactly one):
        1. campaign_dir -- directory whose sub-directories (except those
                           containing "statistics" in their name) are runs

    For each run, the hyperparameters of a typical model are read from
    ``settings.conf`` and the per-category Punzi values from
    ``comp/Mor18_punzi_comp.conf``.  One plot per distinct value of
    ``number_neurons`` is saved to ``<campaign_dir>/statistics/``, with rows
    ordered by ``number_layers``.

    Exits with status 1 if the number of arguments is wrong or no run is found.
    """
    if len(sys.argv) != 2:
        print("Error: exactly 1 argument is required")
        # abort here instead of failing later with an IndexError
        sys.exit(1)

    campaign_dir = sys.argv[1]

    bin_dir = {"Untagged": 0, "VBF1j": 1, "VBF2j": 2, "VHhadr": 3}
    # fixed category order, used for data selection and axis labels alike
    categories = list(bin_dir)

    frames = []

    for subdir in next(os.walk(campaign_dir))[1]:
        if "statistics" in subdir:
            continue

        run_dir = os.path.join(campaign_dir, subdir)
        punzi_path = os.path.join(run_dir, "comp", "Mor18_punzi_comp.conf")
        settings_path = os.path.join(run_dir, "settings.conf")

        # first, read back the configuration file for this run
        conf = ModelCollectionConfigFileHandler()
        conf.LoadConfiguration(settings_path)

        # now select a typical model and read its hyperparameters
        typical_model = conf._get_model_list(
            conf._get_model_collection_list()[0])[0]
        hyperparams = conf.GetHyperparameters(typical_model)
        hyperparam_dict = {key: [val] for key, val in hyperparams.items()}

        # then read in the results in terms of relative Punzi improvement
        # for each category
        conf = ConfigFileHandler()
        conf.LoadConfiguration(punzi_path)
        values = {
            category: float(conf._get_field("Punzi", category))
            for category in categories
        }

        # merge the two dictionaries
        values.update(hyperparam_dict)

        frames.append(pd.DataFrame.from_dict(values))

    if not frames:
        print("Error: no runs found in " + campaign_dir)
        sys.exit(1)

    # a single concatenation instead of growing the frame inside the loop
    df = pd.concat(frames)

    # all different values of the number of neurons that were used in the sweep
    number_neurons = set(df['number_neurons'])

    statistics_dir = os.path.join(campaign_dir, "statistics")
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)

    # sort the pandas dataframe ascending by number of hidden layers
    df = df.sort_values("number_layers")

    for num in number_neurons:
        selected = df["number_neurons"] == num
        number_layers = df.loc[selected, "number_layers"].values.astype(int)
        punzi_data = df.loc[selected, categories].values

        plt.figure()
        plt.imshow(punzi_data, interpolation='none', cmap='RdYlGn',
                   aspect=0.6, vmin=0.8, vmax=1.2)
        plt.xticks(range(len(bin_dir)), categories)
        plt.yticks(range(len(number_layers)), number_layers)
        plt.ylabel("number hidden layers")
        plt.colorbar()
        plt.title("Punzi purity ratio [" + str(int(num)) + " hidden neurons]")
        plt.savefig(os.path.join(
            statistics_dir, "punzi_" + str(int(num)) + "_hidden_neurons.pdf"))
        # release the figure; otherwise one open figure per sweep value piles up
        plt.close()