Beispiel #1
0
def prepare_data(branchinfo_file="vars.yaml", region="2j2b",
                 data_dir="/home/ddavis/ATLAS/data/h5s"):
    """Build the feature, label, weight and auxiliary-label arrays for a region.

    Three samples (ttbar, tW_DR, tW_DS) are read from
    ``{data_dir}/{sample}_r{region}.h5``, reduced to the columns listed for
    ``region`` in ``branchinfo_file``, reweighted and stacked in the order
    ttbar, tW_DR, tW_DS.

    Parameters
    ----------
    branchinfo_file : str
        YAML file mapping a region name to the columns to keep.
    region : str
        Key into the YAML mapping; also part of each HDF5 file name.
    data_dir : str
        Directory holding the HDF5 files. Defaults to the location that used
        to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(X, y, w, z)``: features, labels, weights and auxiliary labels.
    """
    # NOTE(review): yaml.FullLoader can construct more than plain data types.
    # If the branch file may ever come from an untrusted source, switch to
    # yaml.safe_load.
    with open(branchinfo_file, "r") as f:
        branches = yaml.load(f, Loader=yaml.FullLoader)
    branches = branches[region]

    def _load(sample, label, auxlabel):
        # Read one sample for the requested region from ``data_dir``.
        return from_pytables(f"{data_dir}/{sample}_r{region}.h5",
                             label=label,
                             auxlabel=auxlabel)

    ttbar = _load("ttbar", 0, 1)
    tW_DR = _load("tW_DR", 1, 1)
    tW_DS = _load("tW_DS", 1, 0)

    for sample in (tW_DR, ttbar, tW_DS):
        sample.keep_columns(branches)

    # scale_weight_sum is expected to rescale its first argument so that its
    # weight sum matches the second's (confirm against the installed twaml);
    # the fixed factors below are applied on top of that.
    scale_weight_sum(tW_DS, ttbar)
    scale_weight_sum(tW_DR, ttbar)
    tW_DR.weights *= 50
    tW_DS.weights *= 50
    ttbar.weights *= 100

    # label_asarray / auxlabel_asarray are read as plain attributes here, i.e.
    # this code expects them to be properties in the twaml version in use.
    X = pd.concat([ttbar.df, tW_DR.df, tW_DS.df]).to_numpy()
    w = np.concatenate([ttbar.weights, tW_DR.weights, tW_DS.weights])
    y = np.concatenate(
        [ttbar.label_asarray, tW_DR.label_asarray, tW_DS.label_asarray])
    z = np.concatenate([
        ttbar.auxlabel_asarray, tW_DR.auxlabel_asarray, tW_DS.auxlabel_asarray
    ])

    return (X, y, w, z)
Beispiel #2
0
def get_combined():
    """Load the ttbar, tW_DR and tW_DS samples and rebalance the tW weights.

    Both tW samples are passed through ``scale_weight_sum`` against ttbar and
    then have their weights halved. The three weight sums are printed before
    the datasets are returned as ``(ttbar, tW_DR, tW_DS)``.
    """
    ttbar = from_pytables("ttbar.h5", "ttbar", label=0)
    tW_DR = from_pytables("tW_DR.h5", "tW_DR", label=0)
    tW_DS = from_pytables("tW_DS.h5", "tW_DS", label=1)

    tW_samples = (tW_DR, tW_DS)

    # First match each tW sample to ttbar ...
    for sample in tW_samples:
        scale_weight_sum(sample, ttbar)
    # ... then split that weight evenly between the two of them.
    for sample in tW_samples:
        sample.weights *= 0.5

    print(*(sample.weights.sum() for sample in (ttbar, tW_DR, tW_DS)))
    return ttbar, tW_DR, tW_DS
Beispiel #3
0
def test_scale_weight_sum():
    """Check that scale_weight_sum makes the two weight sums agree.

    Two datasets are built from the same ROOT file, the second one is given
    random weights, and after the call the ratio of the weight sums must be
    within 1e-4 of one.
    """
    ds1 = from_root(
        ["tests/data/test_file.root"], name="myds", branches=branches
    )
    ds2 = from_root(
        ["tests/data/test_file.root"], name="ds2", branches=branches
    )

    # Replace the reference weights with random ones of the same length.
    n_events = len(ds1)
    ds2.weights = np.random.randn(n_events) * 10

    scale_weight_sum(ds1, ds2)

    sum_ratio = ds2.weights.sum() / ds1.weights.sum()
    deviation = abs(1.0 - sum_ratio)
    assert deviation < 1.0e-4
Beispiel #4
0
def prepare_data(region="3jHb", delete_datasets=True,
                 data_dir="/Users/ddavis/Desktop/newfullkincomb"):
    """Build the feature, label, weight and auxiliary-label arrays for a region.

    Only the ttbar and tW_DR samples are loaded, from
    ``{data_dir}/{sample}_r{region}.h5``; no column selection is applied.
    The arrays are stacked in the order ttbar, tW_DR.

    Parameters
    ----------
    region : str
        Region tag that is part of each HDF5 file name.
    delete_datasets : bool
        If true, drop the local dataset references once the arrays are built.
    data_dir : str
        Directory holding the HDF5 files. Defaults to the location that used
        to be hard-coded, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(X, y, w, z)``: features, labels, weights and auxiliary labels.
    """
    ttbar = from_pytables(f"{data_dir}/ttbar_r{region}.h5",
                          label=0,
                          auxlabel=1)
    tW_DR = from_pytables(f"{data_dir}/tW_DR_r{region}.h5",
                          label=1,
                          auxlabel=1)

    # scale_weight_sum is expected to rescale tW_DR so its weight sum matches
    # ttbar's (confirm against the installed twaml); both samples then get
    # the same fixed factor.
    scale_weight_sum(tW_DR, ttbar)
    tW_DR.weights *= 100
    ttbar.weights *= 100

    X = pd.concat([ttbar.df, tW_DR.df]).to_numpy()
    w = np.concatenate([ttbar.weights, tW_DR.weights])
    y = np.concatenate([ttbar.label_asarray(), tW_DR.label_asarray()])
    z = np.concatenate([ttbar.auxlabel_asarray(), tW_DR.auxlabel_asarray()])

    if delete_datasets:
        del ttbar, tW_DR

    print("returning data")
    return (X, y, w, z)
Beispiel #5
0
    def __init__(
        self,
        name='zero_jet',
        base_directory='../h5',
        signal_h5='sig_one_jet.h5',
        signal_name='sig',
        signal_tree='wt_DR_nominal',
        signal_latex=r'H$\rightarrow\mu\mu$',
        backgd_h5='bkg_zero_jet.h5',
        backgd_name='bkg',
        backgd_tree='tt_nominal',
        backgd_latex=r'Data sideband',
        weight_name='weight',
        variables=['Z_PT_FSR', 'Z_Y_FSR', 'Muons_CosThetaStar', 'm_mumu'],
        has_syst=False,
        syssig_h5='tW_DS_2j2b.h5',
        syssig_name='tW_DS_2j2b',
        syssig_tree='tW_DS',
        syssig_latex=r'$tW$ DS',
        has_mass=True,
        reg_variable='m_mumu',
        reg_latex=r'm_\mu\mu',
    ):
        """Load the samples, build the training arrays and persist the scaler.

        The signal and background samples are read with ``from_pytables``,
        reduced to ``variables``, and combined (signal first) into:

        * ``self.X_raw`` / ``self.X`` -- raw and standard-scaled features,
        * ``self.y`` -- labels (signal 1, background 0),
        * ``self.z`` -- auxiliary labels, or the normalised ``reg_variable``
          column when ``has_mass`` is true,
        * ``self.w`` -- event weights.

        The fitted scaler is pickled twice into
        ``<base_directory>/<self.describe()>/``; the directory is created if
        it does not exist.

        Parameters
        ----------
        name : str
            Stored as ``self.name``; prefix of the pickle file names.
        base_directory : str
            Parent of the output directory.
        signal_h5, signal_name, signal_tree : str
            File, dataset name and tree name handed to ``from_pytables`` for
            the signal sample.
        backgd_h5, backgd_name, backgd_tree : str
            Same, for the background sample.
        signal_latex, backgd_latex : str
            Display labels, stored unchanged.
        weight_name : str
            Weight column name handed to ``from_pytables`` for every sample.
        variables : list of str
            Feature columns kept in each sample. The default list object is
            shared between calls; this method only passes it on to
            ``keep_columns``.
        has_syst : bool
            If true, a systematic signal sample is loaded as well and
            appended to the signal.
        syssig_h5, syssig_name, syssig_tree : str
            File, dataset name and tree name of the systematic sample.
        syssig_latex : str
            Display label of the systematic sample; stored as ``None`` when
            ``has_syst`` is false.
        has_mass : bool
            If true, ``self.z`` is replaced by the normalised
            ``reg_variable`` values.
        reg_variable : str
            Column used for ``self.z`` when ``has_mass`` is true.
        reg_latex : str
            Display label for ``reg_variable``, stored unchanged.
        """

        def _load(h5, ds_name, tree, label, auxlabel, columns):
            # Read one sample and restrict it to ``columns``.
            dataset = from_pytables(h5,
                                    ds_name,
                                    tree_name=tree,
                                    weight_name=weight_name,
                                    label=label,
                                    auxlabel=auxlabel)
            dataset.keep_columns(columns)
            return dataset

        self.name = name
        self.signal_label, self.backgd_label = 1, 0
        self.center_label, self.syssig_label = 1, 0
        self.signal_latex, self.backgd_latex = signal_latex, backgd_latex

        self.signal = _load(signal_h5, signal_name, signal_tree,
                            self.signal_label, self.center_label, variables)
        self.backgd = _load(backgd_h5, backgd_name, backgd_tree,
                            self.backgd_label, self.center_label, variables)

        self.has_syst = has_syst
        self.syssig_latex = None if not self.has_syst else syssig_latex
        self.losses_test = {'L_gen': [], 'L_dis': [], 'L_diff': []}
        self.losses_train = {'L_gen': [], 'L_dis': [], 'L_diff': []}
        self.has_mass = has_mass
        self.reg_variable = reg_variable
        self.reg_latex = reg_latex

        if self.has_syst:
            # The systematic sample shares the signal label but carries its
            # own auxiliary label; it is merged into the signal dataset.
            self.syssig = _load(syssig_h5, syssig_name, syssig_tree,
                                self.signal_label, self.syssig_label,
                                variables)
            self.signal.append(self.syssig)

        # Equalise signal weights to background weights
        scale_weight_sum(self.signal, self.backgd)

        self.X_raw = np.concatenate(
            [self.signal.df.to_numpy(),
             self.backgd.df.to_numpy()])
        scaler = StandardScaler()
        self.X = scaler.fit_transform(self.X_raw)
        self.y = np.concatenate(
            [self.signal.label_asarray(),
             self.backgd.label_asarray()])
        self.z = np.concatenate(
            [self.signal.auxlabel_asarray(),
             self.backgd.auxlabel_asarray()])

        if self.has_mass:
            # Re-read both samples keeping only the regression column, since
            # ``variables`` need not contain it.
            # NOTE(review): if has_syst is also set and ``append`` added the
            # systematic rows to self.signal, the arrays built here do not
            # include those rows, so self.z would differ in length from
            # self.X / self.y / self.w. Confirm that combination is
            # unsupported, or extend this reload.
            signal = _load(signal_h5, signal_name, signal_tree,
                           self.signal_label, self.center_label,
                           [reg_variable])
            backgd = _load(backgd_h5, backgd_name, backgd_tree,
                           self.backgd_label, self.center_label,
                           [reg_variable])

            def normalise(df):
                # Cap at 200, then shift by 110 and divide by 70. ``clip``
                # returns a new frame instead of writing into ``df``.
                return (df.clip(upper=200) - 110) / 70.

            self.z = np.concatenate([
                normalise(signal.df).to_numpy(),
                normalise(backgd.df).to_numpy()
            ])
        self.w = np.concatenate([self.signal.weights, self.backgd.weights])

        self.output_path = '/'.join([base_directory, self.describe()]) + '/'
        # exist_ok avoids the race between checking for the directory and
        # creating it.
        os.makedirs(self.output_path, exist_ok=True)

        # The two trailing arguments are bound-method objects (``__getitem__``
        # is not called), so their reprs are what gets printed.
        print('\033[92m[INFO]\033[0m', self.describe(),
              self.signal.df.__getitem__, self.backgd.df.__getitem__)
        print('\033[92m[INFO]\033[0m', '-' * 20)

        # Persist the fitted scaler with the default pickle protocol ...
        with open(self.output_path + self.name + '_event.pkl', 'wb') as pkl:
            pickle.dump(scaler, pkl)
        # ... and with protocol 2, the newest one Python 2 can read
        # (presumably the reason for the ``_py2`` suffix).
        with open(self.output_path + self.name + '_event_py2.pkl',
                  'wb') as pkl:
            pickle.dump(scaler, pkl, protocol=2)
Beispiel #6
0
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from twaml.data import from_pytables
from twaml.data import scale_weight_sum
import matplotlib.pyplot as plt

# Read the two samples: ttbar carries label 0, tW_DR carries label 1.
ttbar = from_pytables("ttbar_1j1b.h5", "ttbar", label=0)
tW_DR = from_pytables("tW_DR_1j1b.h5", "tW_DR", label=1)

# Combined sum of weights, taken *before* tW_DR is rescaled below; 1% of it
# is placed in the parameter dict as ``min_child_weight`` further down.
sow = sum((ttbar.weights.sum(), tW_DR.weights.sum()))
mwfl = 0.01 * sow
scale_weight_sum(tW_DR, ttbar)

# Stack labels, features and weights with tW_DR first, then ttbar.
y = np.concatenate([ds.label_asarray for ds in (tW_DR, ttbar)])
X = np.concatenate([ds.df.to_numpy() for ds in (tW_DR, ttbar)])
w = np.concatenate([ds.weights for ds in (tW_DR, ttbar)])

# Three shuffled folds with a fixed seed so the splits are reproducible.
folder = KFold(
    n_splits=3,
    shuffle=True,
    random_state=414,
)

# Per-fold accumulators, each starting as its own empty list.
ttbar_dist, tW_dist, tW_w_dist, ttbar_w_dist, roc_aucs = (
    [] for _ in range(5)
)
for train_idx, test_idx in folder.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    w_train, w_test = w[train_idx], w[test_idx]

    param = {"max_depth": 4, "n_estimators": 150, "min_child_weight": mwfl}