Ejemplo n.º 1
0
 def save_last_training_summary(self, path):
     """
     Writes a JSON summary of the most recent training run to ``path``.

     The summary bundles the data configuration, normalization statistics,
     model architecture and run timing, and is dumped via summaryProcessor.
     Key order is preserved, so the groups below are merged in the same
     order as the original flat dict.
     """
     data_info = {
         'training_output_path': self.training_output_path,
         'qcd_path': self.qcd_path,
         'hlf': True,
         'hlf_to_drop': tuple(self.hlf_to_drop),
         'eflow': True,
         'eflow_base': self.get_EFP_base(),
         'test_split': self.test_data_fraction,
         'val_split': self.validation_data_fraction,
     }
     norm_info = {
         'norm_type': self.norm_type,
         'norm_args': self.norm_args,
         'norm_means_train': self.means_train,
         'norm_stds_train': self.stds_train,
         'norm_means_validation': self.means_validation,
         'norm_stds_validation': self.stds_validation,
         'range': self.data_ranges.tolist(),
     }
     model_info = {
         'target_dim': self.bottleneck_size,
         'input_dim': self.input_size,
         'arch': self.get_architecture_summary(),
         'seed': self.seed,
         'start_time': str(self.start_timestamp),
         'end_time': str(self.end_timestamp),
     }
     summary = {**data_info, **norm_info, **model_info}
     summaryProcessor.dump_summary_json(self.training_params,
                                        summary,
                                        output_path=path)
Ejemplo n.º 2
0
def get_summary():
    """
    Return the summary row whose version equals ``config.best_model``.

    Scans every summary under ``config.summary_path``; if several rows
    match, the last one wins (matching the original behavior). Returns
    None when nothing matches.
    """
    all_summaries = summaryProcessor.get_summaries_from_path(config.summary_path)

    matching_row = None
    for _, row in all_summaries.df.iterrows():
        if summaryProcessor.get_version(row.summary_path) == config.best_model:
            matching_row = row

    return matching_row
Ejemplo n.º 3
0
    def save_aucs(self, summary_path, AUCs_path, **kwargs):
        """
        Computes and saves AUCs (areas under ROC curves) to CSV, one entry
        per training summary found under ``summary_path``.

        Args:
            summary_path: directory containing training summary files.
            AUCs_path: output directory for the AUC CSV files (created if
                missing).
            **kwargs: forwarded to ``model_evaluator.get_aucs`` (e.g. signal
                selection options — exact set depends on the evaluator).
        """
        summaries = summaryProcessor.get_summaries_from_path(summary_path)

        # Create the output directory if needed. exist_ok=True replaces the
        # original os.path.exists() check + mkdir(exist_ok=False), which was
        # race-prone (the directory could appear between check and create).
        Path(AUCs_path).mkdir(parents=True, exist_ok=True)

        for _, summary in summaries.df.iterrows():

            filename = summary.training_output_path.split("/")[-1]
            # Re-seed so evaluation is reproducible per training.
            utils.set_random_seed(summary.seed)
            data_processor = DataProcessor(summary=summary)
            data_loader = self.__get_data_loader(summary)

            auc_params = self.model_evaluator.get_aucs(
                summary=summary,
                AUCs_path=AUCs_path,
                filename=filename,
                data_processor=data_processor,
                data_loader=data_loader,
                **kwargs)
            # None signals "already computed / nothing to do" for this summary.
            if auc_params is None:
                continue

            (aucs, auc_path, append, write_header) = auc_params
            self.__save_aucs_to_csv(aucs=aucs,
                                    path=auc_path,
                                    append=append,
                                    write_header=write_header)
Ejemplo n.º 4
0
def get_summary():
    """
    Load the summary file for ``config.best_model`` and return its last row.

    If the summary dataframe is empty, the loaded summary object itself is
    returned (same fallback as the original implementation).
    """
    path = config.summary_path + config.file_name + "_v" + str(config.best_model) + ".summary"
    print("Loading summary from path: ", path)

    result = summaryProcessor.get_summary_from_path(path)
    # result.df is evaluated once here; rebinding `result` inside the loop
    # does not affect the iteration.
    for _, row in result.df.iterrows():
        result = row

    return result
def get_evaluator(scaler_type, training_version):
    """
    Build an AutoEncoderEvaluator for the latest summary matching the given
    scaler type and training version.
    """
    summaries_dir = "trainingResults/summary/{}/".format(scaler_type)
    name_base = "hlf_eflow{}_{}_".format(efp_base, bottleneck_dim)

    summary_file = summaryProcessor.get_latest_summary_file_path(
        summaries_path=summaries_dir,
        file_name_base=name_base,
        version=training_version)

    return AutoEncoderEvaluator(summary_file, signals=get_signals())
Ejemplo n.º 6
0
    def train(self, summaries_path):
        """
        Train the model via the trainer object, then persist both the model
        and a JSON summary of the run.

        Args:
            summaries_path: directory where the summary JSON is written.
        """
        print("\n\nTraining the model")

        self.start_timestamp = datetime.datetime.now()
        self.model_trainer.train()
        self.end_timestamp = datetime.datetime.now()

        elapsed = self.end_timestamp - self.start_timestamp
        print("Training executed in: ", elapsed, " s")

        self.__save_model()

        # Merge trainer-provided summary with this object's own summary;
        # keys from __get_summary() override duplicates, as before.
        combined = {**self.model_trainer.get_summary(), **self.__get_summary()}
        summaryProcessor.dump_summary_json(combined, output_path=summaries_path)
Ejemplo n.º 7
0
    def get_filename(self, summaries_path):
        """
        Build the output filename for this QCD sample, with the next free
        version number deduced from the existing summaries directory.
        """
        base = "hlf_eflow{}_{}_".format(self.get_EFP_base(),
                                        self.bottleneck_size)
        next_version = summaryProcessor.get_last_summary_file_version(
            summaries_path, base) + 1

        return base + "v{}".format(next_version)
Ejemplo n.º 8
0
    def draw_roc_curves(self, summary_path, summary_version, **kwargs):
        """
        Draw ROC curves on a single shared axis for every summary under
        ``summary_path`` whose version matches ``summary_version``.

        Remaining kwargs are forwarded to the evaluator; signal-related keys
        are excluded from the plot-setup call.
        """
        summaries = summaryProcessor.get_summaries_from_path(summary_path)

        # Only plotting-related kwargs go to the figure setup.
        plot_kwargs = {
            key: value
            for key, value in kwargs.items()
            if key not in ["signals", "signals_base_path"]
        }

        fig, ax_begin, ax_end, plt_end, colors = self.__get_plot_params(
            n_plots=1, **plot_kwargs)
        ax = ax_begin(0)

        for _, summary in summaries.df.iterrows():
            if summaryProcessor.get_version(summary.summary_path) != summary_version:
                continue

            utils.set_random_seed(summary.seed)
            # The evaluator expects the training file name via kwargs.
            kwargs["filename"] = summary.training_output_path.split("/")[-1]

            self.model_evaluator.draw_roc_curves(
                summary=summary,
                data_processor=DataProcessor(summary=summary),
                data_loader=self.__get_data_loader(summary),
                ax=ax,
                colors=colors,
                **kwargs)

        # Diagonal reference line (random classifier).
        diagonal = [value for value in np.arange(0, 1.1, 0.1)]
        ax.plot(diagonal, diagonal, '--', c='black')
        ax_end("false positive rate", "true positive rate")
        plt_end()
        plt.show()
Ejemplo n.º 9
0
# Collect the per-file AUC tables into auc_dict, keyed by file name.
# NOTE(review): auc_dict is defined before this chunk; AUCs_path is globbed
# directly, so it presumably already contains a wildcard — confirm upstream.
for f in glob.glob(AUCs_path):
    data_elt = pd.read_csv(f)
    file_elt = str(f.split('/')[-1])
    data_elt['name'] = file_elt
    auc_dict[file_elt] = data_elt

print("AUC dict: ", auc_dict)

# One table: rows indexed by (mass, nu) pairs, one column per AUC file.
aucs = pd.concat(auc_dict)
aucs['mass_nu_ratio'] = list(zip(aucs.mass, aucs.nu))
# Keyword arguments are required here: DataFrame.pivot's positional
# index/columns/values form was deprecated and removed in pandas 2.0.
aucs = aucs.pivot(index='mass_nu_ratio', columns='name', values='auc')



summaries = summaryProcessor.summary(summary_path=summaries_path)

model_acceptance_fraction = 10  # take top N best performing models
# take lowest 10% losses of all trainings
# n_best = int(0.01 * model_acceptance_fraction * summaries.data.size)
# best_ = summaries.sort_values('total_loss').head(n_best)
# best_name = str(best_.filename.values[0])
#
# print("N best: ", n_best)
# print("Best models: ", best_)
# print("The best model: ", best_name)



AUC_file_name = "hlf_eflow{}_{}_v{}".format(efp_base, bottleneck_dim, training_version[scaler_type])
Ejemplo n.º 10
0
import module.SummaryProcessor as summaryProcessor

# ------------------------------------------------------------------------------------------------
# This script will produce a CSV file with areas under ROC curves (AUCs) for each training
# summary file found in the "summary_path" below, testing on all signal samples found
# in "input_path". The output will be stored in "aucs_path".
# ------------------------------------------------------------------------------------------------

# Input/output locations for the AUC production below.
summary_path = "trainingResults/summary/"
input_path = "../../data/s_channel_delphes/h5_signal_no_MET_over_mt_cut/*.h5"

AUCs_path = "trainingResults/aucs/"

# Compute AUCs for every summary that does not have them yet and store the
# results as CSV files under AUCs_path.
summaryProcessor.save_all_missing_AUCs(summary_path=summary_path,
                                       signals_path=input_path,
                                       AUCs_path=AUCs_path)
Ejemplo n.º 11
0
# Scaler whose trainings are inspected below.
scaler_type = "customStandardScaler"

# Best training version for each scaler type.
training_version = {
    "standardScaler": 8,
    "customScaler": 47,
    "robustScaler": 63,
    "customStandardScaler": 86
}

efp_base = 3
bottleneck_dim = 8
summaries_path = "trainingResults/summary/{}/".format(scaler_type)
summary_base_name = "hlf_eflow{}_{}_".format(efp_base, bottleneck_dim)

input_summary_path = summaryProcessor.get_latest_summary_file_path(
    summaries_path=summaries_path,
    file_name_base=summary_base_name,
    version=training_version[scaler_type])

# masses = [1500, 2000, 2500, 3000, 3500, 4000]
masses = [2000]
rinvs = [0.15, 0.30, 0.45, 0.60, 0.75]

signals_base_path = "../../data/training_data/all_signals/"

# "mass, rinv" label -> glob pattern of the corresponding signal files.
signals = {
    "{}, {}".format(mass, rinv):
    "{}{}GeV_{:1.2f}/base_3/*.h5".format(signals_base_path, mass, rinv)
    for mass in masses for rinv in rinvs
}

# Fixed typo in the user-facing message ("Draing" -> "Drawing").
print("\n\nDrawing ROC curves for summary: ", input_summary_path)
Ejemplo n.º 12
0
# will be used.
# ------------------------------------------------------------------------------------------------

parser = argparse.ArgumentParser(description="Argument parser")
parser.add_argument("-c",
                    "--config",
                    dest="config_path",
                    default=None,
                    required=True,
                    help="Path to the config file")
args = parser.parse_args()

# Convert a file path such as "configs/my_config.py" into an importable
# module path ("configs.my_config"). The original used
# args.config_path.strip(".py"), but str.strip removes any leading/trailing
# '.', 'p' or 'y' *characters* (e.g. "py_cfg.py" -> "_cfg"), not the ".py"
# suffix — so paths beginning or ending with those characters were mangled.
config_path = args.config_path
if config_path.endswith(".py"):
    config_path = config_path[:-len(".py")]
config_path = config_path.replace("/", ".")
config = importlib.import_module(config_path)

input_summary_path = summaryProcessor.get_latest_summary_file_path(
    summaries_path=config.summary_path,
    file_name_base=config.file_name,
    version=config.best_model)

# masses = [1500, 2000, 2500, 3000, 3500, 4000]
masses = [2500]
# rinvs = [0.15, 0.30, 0.45, 0.60, 0.75]
rinvs = [0.45]

# Signal name -> glob pattern of its input files.
signals = {
    "mZprime{}_mDark20_rinv{}_alphapeak".format(mass, rinv).replace(".", ""):
    "{}{}GeV_{:1.2f}/base_3/*.h5".format(config.signals_base_path, mass, rinv)
    for mass in masses for rinv in rinvs
}


def get_hist_data(events, event_indices, jets, errors):
Ejemplo n.º 13
0
def loadSummaries():
    """
    Load the summaries twice — once excluding and once including outdated
    entries — and return both objects as a tuple.
    """
    print("\nRunning loadSummaries\n")

    current = summaryProcessor.summary(summary_path)
    with_outdated = summaryProcessor.summary(summary_path=summary_path, include_outdated=True)

    return current, with_outdated
Ejemplo n.º 14
0
def getLatestSummaryFilePath(efp_base, bottleneck_dim, version=None):
    """
    Return the path to the summary file for the given EFP base and
    bottleneck dimension. When ``version`` is None, the latest version
    found under ``output_path`` is used.
    """
    if version is None:
        base_name = "hlf_eflow{}_{}_".format(efp_base, bottleneck_dim)
        version = summaryProcessor.get_last_summary_file_version(output_path, base_name)

    return summary_path + "/hlf_eflow{}_{}_v{}.summary".format(efp_base, bottleneck_dim, version)
Ejemplo n.º 15
0
    i_mass = i_sample % len(masses)
    i_rinv = int(i_sample / len(masses))

    signal_name = "{}GeV_{:3.2f}".format(masses[i_mass], rinvs[i_rinv])

    return signal_name


# Train config.n_models models; each iteration derives a versioned output
# file name and runs one training via Trainer.
# NOTE(review): training_setting is defined before this chunk and is mutated
# across iterations — confirm that carrying values between runs is intended.
for i in range(config.n_models):

    file_name = config.file_name

    # BDT models are trained per-signal: append the signal name and point
    # the trainer at that signal's input files.
    if config.model_type == "BDT":

        signal_name = get_sigal_name_for_index(args.i_sample)
        signal_path = config.signals_base_path + "/" + signal_name + "/base_{}/*.h5".format(
            config.efp_base)
        training_setting["signal_path"] = signal_path

        file_name += "_" + signal_name

    # Version the output after the newest existing summary with this base name.
    last_version = summaryProcessor.get_last_summary_file_version(
        config.summary_path, file_name)
    file_name += "_v{}".format(last_version + 1)
    training_setting["training_output_path"] = config.results_path + file_name

    trainer = Trainer(**config.training_general_settings, **training_setting)

    trainer.train(summaries_path=config.summary_path)

    print('model {} finished'.format(i))