Code Example #1
def main():

    if len(sys.argv) != 5:
        print "Error: exactly 4 arguments are required"
        sys.exit(1)

    MC_dir = sys.argv[1]
    setting_dir = sys.argv[2]
    training_path = sys.argv[3]
    data_outpath = sys.argv[4]

    # files that should be augmented with this discriminant
    #data_files = ["ggH125", "VBFH125", "ZH125", "WplusH125", "WminusH125"]

    data_files = [
        "ggH125", "VBFH125", "ZH125", "WplusH125", "WminusH125", "ttH125"
    ]

    #data_files = ["ggH125", "VBFH125", "ZH125", "WplusH125", "WminusH125", "ttH125", "ZZTo4l", "ggTo2e2mu_Contin_MCFM701", "ggTo2mu2tau_Contin_MCFM701", "ggTo4mu_Contin_MCFM701", "ggTo2e2tau_Contin_MCFM701", "ggTo4e_Contin_MCFM701", "ggTo4tau_Contin_MCFM701"]

    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = confhandler.GetModelCollection(weightpath=training_path)

    for data_file in data_files:
        augment_file(MC_dir, data_outpath, data_file, mcolls)
Code Example #2
def main():

    if len(sys.argv) != 5:
        print "Error: exactly 4 arguments are required"
        sys.exit(1)

    campaign_dir = sys.argv[1]
    input_config_file = sys.argv[2]
    hyperparam_config_file = sys.argv[3]
    workdir = sys.argv[4]

    mass_point = 125.0

    run_dir = os.path.join(campaign_dir, "run")

    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    MC_path = os.path.join(workdir, "trainval/")

    # this always uses SimpleModel by default (more complicated models are accessible
    # through ConfigFileSweeper when performing a sweep of network hyperparameters etc.)
    #mcoll = ModelFactoryFullMassRangeDynamicInclusive.GenerateSimpleModelCollections(MC_path, input_config_file = input_config_file, hyperparam_config_file = hyperparam_config_file, mass_point = mass_point)
    mcoll = SimpleModelFactoryDynamic.GenerateSimpleModelCollections(
        MC_path,
        input_config_file=input_config_file,
        hyperparam_config_file=hyperparam_config_file,
        mass_point=mass_point)

    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.ToConfiguration(mcoll)

    mconfhandler.save_configuration(os.path.join(run_dir, "settings.conf"))
Code Example #3
def augment_config(mcoll, parent_dir, iterables):
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.ToConfiguration(mcoll)
    
    outname = ""
    
    # augment the config object, given the values in the iterable dict
    for it in iterables.values():
        values = it.cur()
        behaviours = it.auxs
        outname += it.to_strings()[0] # the first entry in a linked sweep determines the name
        section_names = it.names
        parameter_names = it.parameters
        
        # apply all the changes set forth in a (linked) sweep
        for value, behaviour, section_name, parameter_name in zip(values, behaviours, section_names, parameter_names):
            if isinstance(value, dict):
                if behaviour == 'replace':
                    mconfhandler.SetDict(section_name, parameter_name, value, lambda x: str(x))
                elif behaviour == 'append':
                    mconfhandler.AddDictEntry(section_name, parameter_name, value, lambda x: float(x), lambda x: str(x))
            elif isinstance(value, list):
                if behaviour == 'replace':
                    mconfhandler.SetList(section_name, parameter_name, value, lambda x: x)
                elif behaviour == 'append':
                    mconfhandler.AddListEntry(section_name, parameter_name, value, lambda x: x, lambda x: x)

    outpath = parent_dir + outname + "/"
    
    if not os.path.exists(outpath):
        print "creating " + outpath
        os.makedirs(outpath)
        
    mconfhandler.save_configuration(outpath + "settings.conf")    
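
A note on the interface assumed above: augment_config only requires each entry of iterables to expose cur() (the current values of all linked parameters), the parallel lists auxs, names and parameters, and a to_strings() method whose first entry names the output directory. A minimal sketch of such a linked-sweep object (hypothetical, not the project's actual implementation):

class LinkedSweep(object):
    # hypothetical stand-in for the sweep objects consumed by augment_config;
    # all lists are parallel: position i describes one linked parameter change
    def __init__(self, names, parameters, values, auxs):
        self.names = names            # config file section of each change
        self.parameters = parameters  # parameter name within that section
        self.auxs = auxs              # behaviour flag: 'replace' or 'append'
        self._values = values         # current value of each linked parameter

    def cur(self):
        # current values of all linked parameters
        return self._values

    def to_strings(self):
        # human-readable tags; augment_config uses the first one as the
        # name of the output directory
        return [str(p) + "_" + str(v)
                for p, v in zip(self.parameters, self._values)]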
Code Example #4
def main():

    if len(sys.argv) != 5:
        print "Error: exactly 4 arguments are required"
        sys.exit(1)

    in_folder = sys.argv[1]
    out_folder = sys.argv[2]
    tree_name = sys.argv[3]
    run_dir = sys.argv[4]

    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(os.path.join(run_dir, "settings.conf"))
    mcolls = confhandler.GetModelCollection(weightpath=os.path.join(run_dir, "training/"))

    augment_file(in_folder, out_folder, tree_name, mcolls)
Code Example #5
def main():

    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required"
        sys.exit(1)

    #/data_CMS/cms/wind/CJLST_NTuples/
    #MC_dir = sys.argv[1]
    setting_dir = sys.argv[1]
    training_dir = sys.argv[2]

    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = confhandler.GetModelCollection()

    train = Trainer(training_dir)
    opt = optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-6)
    #opt = optimizers.Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = K.epsilon(), decay = 0.0)

    for mcoll in mcolls:
        train.train(mcoll, optimizer=opt, MC_weighting=False)
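
A remark on the optimizer settings above: the Keras SGD optimizer applies time-based decay, scaling the learning rate as lr / (1 + decay * iterations). A quick stand-alone check (plain arithmetic, not project code) of what lr=0.01 and decay=1e-6 imply over a long training run:

lr, decay = 0.01, 1e-6
for iterations in [0, 100000, 1000000]:
    effective_lr = lr / (1.0 + decay * iterations)
    print "effective lr after %7d updates: %.6f" % (iterations, effective_lr)

# prints 0.010000, 0.009091 and 0.005000, respectively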
Code Example #6
def distribute_training_settings(run_path):

    # load the configuration file located in the run directory
    confhandler = ModelCollectionConfigFileHandler()
    confhandler.load_configuration(run_path + "settings.conf")

    # these are all the model collections that need to be trained
    mcolls = confhandler.GetModelCollection()

    # create the folder holding the settings for the individual models and their training
    settings_dir = run_path + "settings_training/"
    if not os.path.exists(settings_dir):
        os.makedirs(settings_dir)

    # iterate over these models and make a separate config file for each of them
    for mcoll in mcolls:
        training_settings_dir = settings_dir + mcoll.name + "/"
    
        if not os.path.exists(training_settings_dir):
            os.makedirs(training_settings_dir)
    
        outconf = ModelCollectionConfigFileHandler()
        outconf.ToConfiguration([mcoll])
        outconf.save_configuration(training_settings_dir + "settings.conf")
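
After distribute_training_settings has run, the run directory contains one self-contained configuration per model collection, schematically (the collection names below are illustrative):

    run_path/
        settings.conf                    (combined configuration, the input)
        settings_training/
            mcoll_A/settings.conf        (configuration for this collection only)
            mcoll_B/settings.conf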
Code Example #7
def main():
    def _compute_class_weights_lengths(gen, preprocessor, MC_weighting=False):
        # determine the actual size of the available dataset and adjust the sample weights correspondingly
        H1_data = gen.H1_collection.get_data(Config.branches, 0.0, 1.0)
        H0_data = gen.H0_collection.get_data(Config.branches, 0.0, 1.0)
        H1_length = len(preprocessor.process(H1_data).values()[0])
        H1_indices = preprocessor.get_last_indices()
        H0_length = len(preprocessor.process(H0_data).values()[0])
        H0_indices = preprocessor.get_last_indices()

        print "H1_length = " + str(H1_length)
        print "H0_length = " + str(H0_length)

        # if per-sample weighting is enabled, also set up the normalization of the event weights
        if MC_weighting:
            H1_weight_sum = np.sum(
                np.maximum(np.array(H1_data["training_weight"][H1_indices]),
                           0.0))
            H0_weight_sum = np.sum(
                np.maximum(np.array(H0_data["training_weight"][H0_indices]),
                           0.0))

            H1_class_weight = float(H0_length) / H1_weight_sum
            H0_class_weight = float(H1_length) / H0_weight_sum
        else:
            # H1_class_weight = 1.0
            # H0_class_weight = float(H1_length) / float(H0_length)
            H1_class_weight = 1.0 + float(H0_length) / float(H1_length)
            H0_class_weight = 1.0 + float(H1_length) / float(H0_length)

        return H1_class_weight, H0_class_weight, H1_length, H0_length

    # this computes low-level performance metrics for a model collection, i.e. the mean-square error
    # computed on the validation dataset for each discriminant. Since the validation datasets are held constant,
    # this is an easy way to directly compare different models

    setting_dir = sys.argv[1]
    training_dir = sys.argv[2]
    out_dir = sys.argv[3]

    # first, need to read in the trained ModelCollection:
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.load_configuration(setting_dir + "settings.conf")
    mcolls = mconfhandler.GetModelCollection(weightpath=training_dir)

    confhandler = ConfigFileHandler()
    out_path = out_dir + "model_benchmark.txt"

    # for the evaluation, need to proceed in the same way as for training, but evaluate the models on the validation
    # data instead of training them on the training data

    for mcoll in mcolls:
        models = mcoll.get_models()
        preprocessors = mcoll.get_preprocessors()
        settings = mcoll.get_settings()

        for cur_model, cur_preprocessor, cur_settings in zip(
                models, preprocessors, settings):
            val_gen = Generator(mcoll.H1_stream,
                                mcoll.H0_stream,
                                Config.branches,
                                preprocessor=cur_preprocessor,
                                chunks=1,
                                MC_weighting=False)
            val_gen.setup_validation_data()
            val_H1_classweight, val_H0_classweight, H1_length, H0_length = _compute_class_weights_lengths(
                val_gen, cur_preprocessor, False)
            print "H1 class weight = " + str(val_H1_classweight)
            print "H0 class weight = " + str(val_H0_classweight)
            print "H1 validation length = " + str(H1_length)
            print "H0 validation length = " + str(H0_length)
            val_gen.set_H1_weight(val_H1_classweight)
            val_gen.set_H0_weight(val_H0_classweight)
            val_gen.set_minimum_length(0)
            cur_model.get_keras_model().compile(optimizer=optimizers.Adam(),
                                                loss="mean_squared_error",
                                                metrics=["binary_accuracy"])
            res = cur_model.get_keras_model().evaluate_generator(
                val_gen.preprocessed_generator(), steps=1)
            print "statistics for model " + cur_model.name
            print res
            print cur_model.get_keras_model().metrics_names

            confhandler.new_section(cur_model.name)
            confhandler.set_field(cur_model.name, 'H0_val_length',
                                  str(H0_length))
            confhandler.set_field(cur_model.name, 'H1_val_length',
                                  str(H1_length))
            confhandler.set_field(cur_model.name, 'val_loss', str(res[0]))

    confhandler.save_configuration(out_path)
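
The class weights computed above (for MC_weighting=False) are chosen such that both hypotheses contribute the same total weight to the loss, no matter how unbalanced the validation dataset is. A small stand-alone check of this property, with illustrative sample counts:

H1_length, H0_length = 1000, 4000

# same formulas as in _compute_class_weights_lengths above
H1_class_weight = 1.0 + float(H0_length) / float(H1_length)  # 5.0
H0_class_weight = 1.0 + float(H1_length) / float(H0_length)  # 1.25

# both classes end up contributing H1_length + H0_length in total
print H1_length * H1_class_weight  # 5000.0
print H0_length * H0_class_weight  # 5000.0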
Code Example #8
def main():
    # runs to check for (good) models (the first one passed is taken as the reference run from which the
    # available models are taken - it is expected that all other runs follow this structure):
    input_runs = []

    print "==================================================================="
    print "looking for models in the following runs:"

    for campaign_dir in sys.argv[1:-2]:
        for run_dir in next(os.walk(campaign_dir))[1]:
            if not "bin" in run_dir:
                run_path = os.path.join(campaign_dir, run_dir)
                print run_path
                input_runs.append(run_path)

    print "==================================================================="

    # output training campaign: this will consist of a combination of the models found in the
    # campaigns listed above, combined such that the overall performance is optimized
    output_run = os.path.join(sys.argv[-1], "optimized")

    # where the configuration file for the hyperparameter settings should be stored
    hyperparam_output = os.path.join(output_run, "../hyperparameters.conf")

    os.makedirs(output_run)

    # load the available model names
    reference_run = input_runs[0]
    available_mcolls = next(os.walk(os.path.join(reference_run, "training")))[1]

    mcolls_winning = []

    for mcoll in available_mcolls:
        models = next(os.walk(os.path.join(reference_run, "training", mcoll)))[1]

        # load a representative version of the current model collection...
        mconfhandler = ModelCollectionConfigFileHandler()
        mconfhandler.load_configuration(
            os.path.join(reference_run, "settings_training", mcoll,
                         "settings.conf"))
        mcoll_template = mconfhandler.GetModelCollection()[0]

        # ... but strip away all the actual model components
        mcoll_template.model_dict = {}
        mcoll_template.preprocessor_dict = {}
        mcoll_template.settings_dict = {}

        for model in models:
            # compare this model across the different runs
            losses = [get_loss(run, mcoll, model) for run in input_runs]

            winner = np.argmin(losses)

            winning_run = input_runs[winner]

            # copy the winning model into the output run
            shutil.copytree(
                os.path.join(winning_run, "training", mcoll, model),
                os.path.join(output_run, "training", mcoll, model))

            print "--------------------------------------------"
            print " take " + model + " from " + winning_run
            print "--------------------------------------------"

            # load the winning model to keep track of its settings
            mconfhandler = ModelCollectionConfigFileHandler()
            mconfhandler.load_configuration(
                os.path.join(winning_run, "settings_training", mcoll,
                             "settings.conf"))
            mcoll_winning = mconfhandler.GetModelCollection()[0]

            # then pull the winning model over into the template
            winning_model = mcoll_winning.model_dict[model]
            winning_preprocessor = mcoll_winning.preprocessor_dict[model]
            winning_settings = mcoll_winning.settings_dict[model]

            mcoll_template.add_model(winning_preprocessor, winning_model,
                                     winning_settings)

        mcolls_winning.append(mcoll_template)

    # now save the put-together config file also into the output run
    mconfhandler = ModelCollectionConfigFileHandler()
    mconfhandler.ToConfiguration(mcolls_winning)
    mconfhandler.save_configuration(os.path.join(output_run, "settings.conf"))

    # now distribute the training settings again, as usual:
    distribute_training_settings(output_run + '/')

    # now create the hyperparameter config file for each model, taken from the winners
    hp_confhandler = ConfigFileHandler()
    for mcoll in mcolls_winning:
        for model_name, model in mcoll.model_dict.iteritems():
            hp_confhandler.new_section(model_name)
            hp_confhandler.set_field(
                model_name, "hyperparameters",
                ConfigFileUtils.serialize_dict(model.hyperparameters,
                                               lambda x: str(x)))

    hp_confhandler.save_configuration(hyperparam_output)

    print "==================================================================="
    print "hyperparameter configuration file written to " + hyperparam_output
    print "==================================================================="
Code Example #9
def main():

    if len(sys.argv) != 2:
        print "Error: exactly 1 argument is required"
        sys.exit(1)

    campaign_dir = sys.argv[1]

    bin_dir = {"Untagged": 0,
               "VBF1j": 1,
               "VBF2j": 2,
               "VHhadr": 3}

    def format_parameter_list(inlist):
        outstring = ""
        linewidth = 0
    
        for parameter in inlist:
            if "D_" in parameter:
                newstring = "MELA, "
            else:
                newstring = parameter + ", "
            
            outstring += newstring
            linewidth += len(newstring)
        
            if linewidth > 20:
                outstring += "\n"
                linewidth = 0
            
        return outstring[:-2]

    df = pd.DataFrame()

    for subdir in next(os.walk(campaign_dir))[1]:
        if "statistics" not in subdir:
            values = {}

            punzi_path = campaign_dir + subdir + "/comp/Mor18_punzi_comp.conf"
            settings_path = campaign_dir + subdir + "/settings.conf"
    
            # first, read back the configuration file for this run
            conf = ModelCollectionConfigFileHandler()
            conf.LoadConfiguration(settings_path)
    
            # now select a typical model and read its hyperparameters
            typical_model = conf._get_model_list(
                conf._get_model_collection_list()[0])[0]
            hyperparams = conf.GetHyperparameters(typical_model)
            hyperparam_dict = {key: [val] for key, val in hyperparams.iteritems()}
        
            # also read the list of input parameters that have been fed into the network
            param_list = conf.GetInputParameterList(typical_model)
            values['input_columns'] = [format_parameter_list(param_list)]
            values['number_inputs'] = len(param_list)
            
            # then read in the results in terms of relative Punzi improvement for each category
            conf = ConfigFileHandler()
            conf.LoadConfiguration(punzi_path)
    
            # load the Punzi values for each category
            for category, bin_number in bin_dir.iteritems():
                values[category] = float(conf._get_field("Punzi", category))
        
            # merge the two dictionaries
            values.update(hyperparam_dict)
            df = df.append(pd.DataFrame.from_dict(values))

    statistics_dir = campaign_dir + "statistics/"
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)

    punzi_data = df[bin_dir.keys()].as_matrix()
    punzi_data = np.transpose(punzi_data)

    inparam_labels = df['input_columns'].as_matrix()

    plt.figure(figsize = (8, 9))
    plt.imshow(punzi_data, interpolation = 'none', cmap = 'RdYlGn', aspect = 0.6, vmin = 0.8, vmax = 1.2)
    plt.colorbar()
    plt.yticks(range(len(bin_dir)), bin_dir.keys())
    plt.xticks(range(len(df)), inparam_labels, rotation = 'vertical')
    plt.title("Punzi purity ratio")
    #plt.tight_layout()
    plt.savefig(statistics_dir + "punzi.pdf", bbox_inches = 'tight')
Code Example #10
def main():

    if len(sys.argv) != 2:
        print "Error: exactly 1 argument is required"
        sys.exit(1)

    campaign_dir = sys.argv[1]

    bin_dir = {"Untagged": 0, "VBF1j": 1, "VBF2j": 2, "VHhadr": 3}

    df = pd.DataFrame()

    for subdir in next(os.walk(campaign_dir))[1]:
        if "statistics" not in subdir:
            punzi_path = campaign_dir + subdir + "/comp/Mor18_punzi_comp.conf"
            settings_path = campaign_dir + subdir + "/settings.conf"

            # first, read back the configuration file for this run
            conf = ModelCollectionConfigFileHandler()
            conf.LoadConfiguration(settings_path)

            # now select a typical model and read its hyperparameters
            typical_model = conf._get_model_list(
                conf._get_model_collection_list()[0])[0]
            hyperparams = conf.GetHyperparameters(typical_model)
            hyperparam_dict = {
                key: [val]
                for key, val in hyperparams.iteritems()
            }

            # then read in the results in terms of relative Punzi improvement for each category
            conf = ConfigFileHandler()
            conf.LoadConfiguration(punzi_path)

            # load the Punzi values for each category
            values = {}
            for category, bin_number in bin_dir.iteritems():
                values[category] = float(conf._get_field("Punzi", category))

            # merge the two dictionaries
            values.update(hyperparam_dict)
            df = df.append(pd.DataFrame.from_dict(values))

    # all different values of the number of neurons that were used in the sweep
    number_neurons = set(df['number_neurons'])

    statistics_dir = campaign_dir + "statistics/"
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)

    # sort the pandas dataframe ascending by number of hidden layers
    df = df.sort_values("number_layers")

    for num in number_neurons:
        number_layers = df.loc[df["number_neurons"] == num,
                               ["number_layers"]].as_matrix().flatten().astype(
                                   int)
        punzi_data = df.loc[df["number_neurons"] == num,
                            bin_dir.keys()].as_matrix()

        plt.figure()
        plt.imshow(punzi_data,
                   interpolation='none',
                   cmap='RdYlGn',
                   aspect=0.6,
                   vmin=0.8,
                   vmax=1.2)
        plt.xticks(range(len(bin_dir)), bin_dir.keys())
        plt.yticks(range(len(number_layers)), number_layers)
        plt.ylabel("number of hidden layers")
        plt.colorbar()
        plt.title("Punzi purity ratio [" + str(int(num)) + " hidden neurons]")
        plt.savefig(statistics_dir + "punzi_" + str(int(num)) +
                    "_hidden_neurons.pdf")