def __init__(self, run_conf):
    """Set up the adversarial-training run: models, data, losses, metrics.

    Parameters
    ----------
    run_conf : configuration object
        Exposes ``.loc`` (run directory) and ``.get(section[, key])`` /
        ``['key']``-style access to the Optimiser/Training/Benchmark/
        Plotting/Classifier/Adversary sections.
    """
    # configurations
    self.loc = run_conf.loc
    self.run_conf = run_conf
    self.opt_conf = run_conf.get('Optimiser')
    self.trn_conf = run_conf.get('Training')
    self.bcm_conf = run_conf.get('Benchmark')
    self.plt_conf = run_conf.get('Plotting')

    # models: the factory name encodes model type, jet category and n_rmd
    clf_type = run_conf.get('Classifier', 'type')
    adv_type = run_conf.get('Adversary', 'type')
    njet = self.trn_conf['njet']
    rmd = self.trn_conf['n_rmd']
    clf_name = '{}_{}_{}'.format(clf_type, njet, rmd)
    adv_name = '{}_{}_{}'.format(adv_type, njet, rmd)
    self.clf = models.Classifier.create(clf_name, run_conf.get('Classifier'))
    self.adv = models.Adversary.create(adv_name, run_conf.get('Adversary'))

    print('------------')
    print('--- Settings:')
    print(self.clf)
    print(self.adv)
    print('Optimisation', self.opt_conf)
    print('Training', self.trn_conf)
    print('Benchmark', self.bcm_conf)
    print('Plotting', self.plt_conf)
    print('------------')

    # load the data handler and the data
    self.njet = njet
    self.dh = None
    self.bcm_features = dataset.feature_list(self.trn_conf['bcm_features'], self.njet)
    self.clf_features = dataset.feature_list(self.trn_conf['clf_features'], self.njet)
    self._ds = {}
    self._tt = ['train', 'test']
    self._tts = ['train', 'test', 'ss']
    self._load_data()

    # prepare losses: classifier (C), adversary (A), combined (CA), benchmark
    self._losses = {tt: {n: [] for n in ['C', 'A', 'CA', 'BCM']} for tt in self._tt}

    # prepare metrics
    self.metrics = ['roc_auc', 'sig_eff', 'ks_metric']
    self._metric_vals = {met: {tt: [] for tt in self._tt} for met in self.metrics}

    # prepare classifier scores
    self.bcm = None
    self.bcm_score = {}
    self.clf_score = {}
    # NOTE: the original called 'clf_scores'.format(self.clf.name), which is
    # a no-op because the string has no '{}' placeholder — the directory has
    # always been named 'clf_scores'; the dead .format call is dropped here.
    self.score_loc = utils.makedir(os.path.join(self.loc, 'clf_scores'))
def make_new_runconf_dir(self):
    """Create and return the first free ``points/runNNNN`` directory.

    Probes run0000, run0001, ... under ``self.loc/points`` and creates
    (via utils.makedir) the first path that does not already exist.
    """
    run_nb = 0
    while True:
        candidate = os.path.join(self.loc, 'points', 'run{:04d}'.format(run_nb))
        # first unused index wins
        if not os.path.exists(candidate):
            return utils.makedir(candidate)
        run_nb += 1
def main():
    """Submit one job per run configuration of a sweep (or run one locally)."""
    # parse args
    parser = argparse.ArgumentParser(description='ArgParser')
    parser.add_argument('-s', '--sweep', default='testsweep',
                        help='Name of the sweep.')
    parser.add_argument('--batch', default=False, action='store_true')
    args = parser.parse_args()

    # get the location and configuration file
    loc = utils.makedir(os.path.join(os.getenv('RUN'), args.sweep))
    sweep_conf = configuration.Configuration(
        os.path.join(loc, 'sweep_conf.ini'))
    print('--- Running {}'.format(args.sweep))
    print(sweep_conf)

    # submit the jobs for all the run configs
    for run_conf in sweep_conf:
        print()
        print('--- Running point')
        print()

        # build the command for this point
        script = os.path.join(os.getenv('SRC'), 'scripts', 'run_point.py')
        conf_path = run_conf.path
        command = 'time python3 {} -p {}'.format(script, conf_path)

        if args.batch:
            # batch mode: write a job script next to the run config and submit
            run_nb = conf_path.split('/')[-2]
            job_path = os.path.join(
                os.path.split(run_conf.path)[0], '{}.sh'.format(run_nb))
            print(job_path)
            utils.write_job(command, job_path)
            utils.send_job(job_path)
        else:
            # local mode: run only the first point, then stop
            # NOTE(review): the break placement is ambiguous in the source
            # layout — confirm it belongs to the non-batch branch
            os.system(command)
            break
def main():
    """Summarise a sweep: plot every metric against every option, plus 2D views."""
    # parse args
    parser = argparse.ArgumentParser(description='ArgParser')
    parser.add_argument('-s', '--sweep', default='testsweep',
                        help='Name of the sweep.')
    args = parser.parse_args()
    print('--- Summarising {}'.format(args.sweep))

    # read the results as a dataframe
    results, options, metrics = configuration.read_results(args.sweep)

    # plot the results summary
    loc = utils.makedir(os.path.join(os.getenv('RUN'), args.sweep, 'plots'))
    for metric, option in itertools.product(metrics, options):
        plot.metric_vs_parameter(metric, option, results, loc)

    # 2D metric-vs-metric plots, one set per option
    pairs = (
        ('sig_eff', 'roc_auc'),
        ('ks_metric', 'roc_auc'),
        ('ks_metric', 'sig_eff'),
    )
    for option in options:
        for m_x, m_y in pairs:
            plot.metric2d(m_x, m_y, option, results, loc)
def loc(name):
    # Resolve the output directory for a pipeline step: the final step
    # writes straight into the top-level run directory, intermediate
    # steps get (and create) a named subdirectory under it.
    # NOTE(review): `self` and `is_final_step` come from an enclosing
    # scope not visible here — this is a nested helper, not a method;
    # confirm against the surrounding definition.
    return self.loc if is_final_step else utils.makedir(os.path.join(self.loc, name))
def main():
    """Run the four-step sample preparation pipeline.

    Step 1: per-file mass cut + event selection (ROOT macro).
    Step 2: hadd the selected files per dataset.
    Step 3: split each hadded file into jet-multiplicity trees.
    Step 4: shuffle each tree and write the final output files.
    """
    #####################
    # Les configurables
    #####################
    prod_name = 'default'
    # prefix the production name with today's date, e.g. 20240101_default
    prod_name = '{}_{}'.format(datetime.strftime(datetime.today(), '%Y%m%d'),
                               prod_name)

    # other settings: which steps to run, and directory labels
    do_steps = [1, 2, 3, 4]
    step1 = 'step1'
    step2 = 'step2'
    step3 = 'step3'
    out = 'out'

    #####################
    # Prepare directories and files
    #####################
    loc = {}
    loc[defs.sig] = utils.makedir(
        '/data/atlassmallfiles/users/zgubic/hmumu/v17/hadd')
    loc[defs.data] = utils.makedir(
        '/data/atlassmallfiles/users/zgubic/hmumu/v17/hadd')
    loc[defs.ss] = utils.makedir(
        '/data/atlassmallfiles/users/zgubic/hmumu/spurious_signal')
    loc[out] = utils.makedir(os.path.join(os.getenv('DATA'), prod_name))

    # get the file names
    fnames = {}
    fnames[defs.sig] = [
        'mc16a.345097.root',
        'mc16d.345097.root',
        'mc16a.345106.root',
        'mc16d.345106.root',
    ]
    fnames[defs.data] = [
        'data15.allYear.sideband.root',
        'data16.allYear.sideband.root',
        'data17.allYear.sideband.root',
    ]
    fnames[defs.ss] = os.listdir(loc[defs.ss])

    #####################
    # Step 1: Mass cut + selection
    #####################
    if 1 in do_steps:
        loc[step1] = utils.makedir(os.path.join(loc[out], step1))
        # NOTE: loop variable renamed from `dataset` to `ds` so it no
        # longer shadows the `dataset` module used elsewhere in the file.
        for ds in defs.datasets:
            for fname in fnames[ds]:
                in_file = os.path.join(loc[ds], fname)
                out_file = os.path.join(loc[step1], fname)

                # run the selection; spurious-signal files skip the full
                # selection, only sig files are flagged as signal
                full_selection = 1 if ds in [defs.sig, defs.data] else 0
                is_signal = 1 if ds in [defs.sig] else 0
                command = ("root -l -q 'selection.cxx(\"{i}\", \"{o}\", "
                           "{sel}, {sig})'").format(
                    i=in_file, o=out_file, sel=full_selection, sig=is_signal)
                os.system(command)

    #####################
    # Step 2: Hadd
    #####################
    if 2 in do_steps:
        loc[step2] = utils.makedir(os.path.join(loc[out], step2))
        for ds in defs.datasets:
            # collect input files
            in_files = [os.path.join(loc[step1], fname)
                        for fname in fnames[ds]]

            # hadded file
            out_file = os.path.join(loc[step2], ds) + '.root'

            # hadd them; inputs must be space-separated on one line
            # (the original source had a stray newline inside the join
            # separator, which would have produced a broken shell command)
            command = 'hadd -f {} {}'.format(out_file, ' '.join(in_files))
            os.system(command)

    #####################
    # Step 3: Split in jet categories
    #####################
    if 3 in do_steps:
        loc[step3] = utils.makedir(os.path.join(loc[out], step3))
        for ds in defs.datasets:
            # hadded file from step 2
            hadd_file = os.path.join(loc[step2], '{}.root'.format(ds))

            # split in 0, 1, 2+ jet datasets (separate trees inside)
            split_file = os.path.join(loc[step3], '{}.root'.format(ds))

            # run the macro to make new trees
            command = "root -l -q 'njet_split.cxx(\"{i}\", \"{o}\")'".format(
                i=hadd_file, o=split_file)
            os.system(command)

    #####################
    # Step 4: Shuffle the entries
    #####################
    if 4 in do_steps:
        for ds in defs.datasets:
            for tree in defs.channels:
                # read the dataframe for this jet-category tree
                in_file = os.path.join(loc[step3], '{}.root'.format(ds))
                df = rpd.read_root(in_file, key=tree)

                # shuffle the tree
                df = df.sample(frac=1).reset_index(drop=True)

                # write it; first tree creates the file, the rest append
                out_file = os.path.join(loc[out], '{}.root'.format(ds))
                mode = 'w' if tree == defs.channels[0] else 'a'
                df.to_root(out_file, key=tree, mode=mode)