Code Example #1
def test_concatenate_awkward():
    awkward = pytest.importorskip("awkward")
    files = skhep_testdata.data_path(
        "uproot-sample-6.20.04-uncompressed.root").replace("6.20.04", "*")
    arrays = uproot.concatenate({files: "sample"}, ["i8", "f8"], library="ak")
    assert isinstance(arrays, awkward.Array)
    assert set(awkward.fields(arrays)) == set(["i8", "f8"])
    assert len(arrays) == 420
Code Example #2
def test_concatenate_pandas():
    pandas = pytest.importorskip("pandas")
    files = skhep_testdata.data_path(
        "uproot-sample-6.20.04-uncompressed.root").replace("6.20.04", "*")
    arrays = uproot.concatenate({files: "sample"}, ["i8", "f8"], library="pd")
    assert isinstance(arrays, pandas.DataFrame)
    assert set(arrays.columns.tolist()) == set(["i8", "f8"])
    assert len(arrays) == 420
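
A note on file specifications: across the examples in this section, uproot.concatenate is pointed at its inputs in several equivalent ways, including a "path:tree" string (optionally with wildcards), a {path: tree} dict, or a list of either. A minimal sketch with a placeholder file pattern:

import uproot

# Equivalent ways of selecting the "sample" tree in a set of files
# (the file pattern is a placeholder):
uproot.concatenate("data-*.root:sample", ["i8", "f8"], library="np")
uproot.concatenate({"data-*.root": "sample"}, ["i8", "f8"], library="np")
uproot.concatenate([{"data-*.root": "sample"}], ["i8", "f8"], library="np")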
Code Example #3
    def pd_tree(self, path, tname, squery=None):
        try:
            tree = uproot.open(path)[tname]
        except Exception:
            pwarning('error getting', tname, 'from file:', path)
            return None
        if not tree:
            perror('Tree {} not found in file {}'.format(tname, path))
            return None
        df = uproot.concatenate(tree, library="pd")

        if squery:
            df = df.query(squery)
            df = df.reset_index(drop=True)
        return df
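
For orientation, a hypothetical call of the helper above might look like the sketch below; the instance, file, tree, and query names are placeholders rather than anything from the original code.

# All names below are placeholders for illustration only.
df = reader.pd_tree('ntuple.root', 'DecayTree', squery='pt > 500')
if df is not None:
    print(df.head())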
Code Example #4
    def load_dataframe(self):

        # Load event tree into dataframe
        if not self.skip_event_tree:
            event_tree = None
            event_df = None
            event_tree_name = self.tree_dir + self.event_tree_name
            with uproot.open(self.input_file)[event_tree_name] as event_tree:
                if not event_tree:
                    raise ValueError("Tree %s not found in file %s" %
                                     (event_tree_name, self.input_file))
                self.event_df_orig = uproot.concatenate(event_tree,
                                                        self.event_columns,
                                                        library="pd")

            # Check if there are duplicated event ids
            #print(self.event_df_orig)
            #d = self.event_df_orig.duplicated(self.unique_identifier, keep=False)
            #print(self.event_df_orig[d])
            n_duplicates = sum(
                self.event_df_orig.duplicated(self.unique_identifier))
            if n_duplicates > 0:
                raise ValueError(
                    "There appear to be %i duplicate events in the event dataframe"
                    % n_duplicates)

            # Apply event selection
            self.event_df_orig = self.event_df_orig.reset_index(drop=True)
            if self.is_pp:
                event_criteria = 'is_ev_rej == 0'
            else:
                event_criteria = 'is_ev_rej == 0 and centrality > @self.min_centrality and centrality < @self.max_centrality'
            if self.event_plane_range:
                event_criteria += ' and event_plane_angle > @self.event_plane_range[0] and event_plane_angle < @self.event_plane_range[1]'
            event_df = self.event_df_orig.query(event_criteria)
            event_df = event_df.reset_index(drop=True)

        # Load track tree into dataframe
        track_tree = None
        track_df_orig = None
        track_tree_name = self.tree_dir + self.track_tree_name
        with uproot.open(self.input_file)[track_tree_name] as track_tree:
            if not track_tree:
                raise ValueError("Tree %s not found in file %s" %
                                 (track_tree_name, self.input_file))
            track_df_orig = uproot.concatenate(track_tree,
                                               self.track_columns,
                                               library="pd")

        # Apply hole selection, in case of jetscape
        if self.is_jetscape:
            if self.holes:
                track_criteria = 'status == -1'
            else:
                track_criteria = 'status == 0'
            track_df_orig = track_df_orig.query(track_criteria)
            track_df_orig = track_df_orig.reset_index(drop=True)

        # Check if there are duplicated tracks
        #print(track_df_orig)
        #d = track_df_orig.duplicated(self.track_columns, keep=False)
        #print(track_df_orig[d])
        n_duplicates = sum(track_df_orig.duplicated(self.track_columns))
        if n_duplicates > 0:
            raise ValueError(
                "There appear to be %i duplicate particles in the track dataframe"
                % n_duplicates)

        # Merge event info into track tree
        if self.skip_event_tree:
            self.track_df = track_df_orig
        else:
            self.track_df = pandas.merge(track_df_orig,
                                         event_df,
                                         on=self.unique_identifier)

        # Check if there are duplicated tracks in the merge dataframe
        #print(self.track_df)
        #d = self.track_df.duplicated(self.track_columns, keep=False)
        #print(self.track_df[d])
        n_duplicates = sum(self.track_df.duplicated(self.track_columns))
        if n_duplicates > 0:
            sys.exit(
                'ERROR: There appear to be {} duplicate particles in the merged dataframe'
                .format(n_duplicates))

        return self.track_df
Code Example #5
def main(args):
    logger = setup_logging()

    v0_input_dir = args.v0_input_dir
    vcustom_input_dir = args.vcustom_input_dir
    output_dir = args.output_dir
    channel = args.channel

    logger.info("Fit to Double Crystal Ball")

    tree_name = tree_name_tmpl.format(channel)

    final_plots_specs = {}

    # Needed names for files and trees
    file_dirs = {"Vertex 0th": v0_input_dir, "Vertex Reco": vcustom_input_dir}

    fit_colors = {"Vertex 0th": "kRed", "Vertex Reco": "kBlue"}

    # Create sigma_m_over_m categories
    logger.info("Creating categories of SigmaMOverM")
    file_format = {
        "Vertex 0th": v0_input_dir + "/" + file_names_tmpl[channel],
        "Vertex Reco": vcustom_input_dir + "/" + file_names_tmpl[channel]
    }

    categories = {}
    smom = "sigma_m"  # due to how we defined it in flashgg, it's already divided by M
    for vtx_name, direc in file_format.items():
        categories[vtx_name] = {}
        final_plots_specs[vtx_name] = {}

        arr = uproot.concatenate(["{}:{}".format(direc, tree_name)],
                                 expressions=[smom],
                                 library="ak")
        arr = np.asarray([ev[0] for ev in arr.to_numpy()])

        cut_format = "{var} > {min_edge} && {var} < {max_edge}"
        edge_min = 0.
        edge_max = 0.035
        n_bins = 5
        edges = get_edges(arr, edge_min, edge_max, n_bins)

        low = edges[0]
        for high in edges[1:]:
            cat_name = "SigmaMOverM_{:.5f}-{:.5f}".format(low, high)
            cat_string = cut_format.format(var=smom,
                                           min_edge=low,
                                           max_edge=high)
            categories[vtx_name][cat_name] = cat_string

            final_plots_specs[vtx_name][cat_name] = {}
            final_plots_specs[vtx_name][cat_name]["range"] = (low, high)

            low = high

    logger.info("Created categories {}".format(categories))

    for vtx_name, direc in file_dirs.items():
        logger.info("Working with vertex {}".format(vtx_name))
        for cat_name, cut in categories[vtx_name].items():
            logger.info("Working with category {}".format(cat_name))

            chain = ROOT.TChain()
            files = [
                fl for fl in os.listdir(direc)
                if fl.startswith(file_names_tmpl[channel][:20])
            ]
            for fl in files:
                chain.Add("{}/{}/{}".format(direc, fl, tree_name))
            rdf = ROOT.RDataFrame(chain)
            rdf_cut = rdf.Filter(cut)
            mass_arr = rdf_cut.Take[float]("mass").GetValue()
            weight_arr = rdf_cut.Take[float]("weight").GetValue()
            mass_fake_arr = array("d", [0.])
            weight_fake_arr = array("d", [0.])
            cut_tree = ROOT.TTree("cut_tree", "cut_tree")
            cut_tree.Branch("mass", mass_fake_arr, "mass/D")
            cut_tree.Branch("weight", weight_fake_arr, "weight/D")
            for ev_mass, ev_weight in zip(mass_arr, weight_arr):
                mass_fake_arr[0] = ev_mass
                weight_fake_arr[0] = ev_weight
                cut_tree.Fill()

            # RooFit objects
            mass = ROOT.RooRealVar("mass", "Invariant mass [GeV]", 125, 115,
                                   135)
            weight = ROOT.RooRealVar("weight", "weight", -1, 1)

            mu = ROOT.RooRealVar("mu", "mu", 125, 120, 130)
            sigma1 = ROOT.RooRealVar("sigma1", "sigma1", 1, 0.1, 10)
            alpha1 = ROOT.RooRealVar("alpha1", "alpha1", 1, 0, 10)
            n1 = ROOT.RooRealVar("n1", "n1", 1, 0, 5)

            cb1 = ROOT.RooCBShape("cb1", "cb1", mass, mu, sigma1, alpha1, n1)

            sigma2 = ROOT.RooRealVar("sigma2", "sigma2", 4, 0.1, 10)
            alpha2 = ROOT.RooRealVar("alpha2", "alpha2", 1, 0, 10)
            n2 = ROOT.RooRealVar("n2", "n2", 1, 0, 5)

            frac = ROOT.RooRealVar("frac", "frac", 0.5, 0., 1.)

            cb2 = ROOT.RooCBShape("cb2", "cb2", mass, mu, sigma2, alpha2, n2)

            model = ROOT.RooAddPdf("model", "model", ROOT.RooArgList(cb1, cb2),
                                   ROOT.RooArgList(frac))

            # Create (weighted) dataset
            data = ROOT.RooDataSet("data_{}".format(cat_name),
                                   "data_{}".format(cat_name), cut_tree,
                                   ROOT.RooArgSet(mass, weight), "",
                                   weight.GetName())

            # Fit in subrange
            mass.setRange("higgs", 116, 134)
            logger.info("Performing fit")
            fit_result = model.fitTo(
                data, ROOT.RooFit.Range("higgs"), ROOT.RooFit.Save(1),
                ROOT.RooFit.AsymptoticError(1))

            # Plot decoration
            mass_frame = mass.frame(
                ROOT.RooFit.Title("Mass-{}-{}".format(vtx_name, cat_name)))
            mass_frame.GetYaxis().SetTitleOffset(1.6)
            data.plotOn(mass_frame,
                        ROOT.RooFit.DataError(ROOT.RooAbsData.SumW2))
            model.plotOn(
                mass_frame,
                ROOT.RooFit.LineColor(getattr(ROOT, fit_colors[vtx_name])))
            chi_sq = mass_frame.chiSquare()
            model.paramOn(
                mass_frame, ROOT.RooFit.Layout(0.65),
                ROOT.RooFit.Label("chiSq / ndof = {:.5f}".format(chi_sq)))

            # Dump plots
            logger.info("Dumping plots")
            c = ROOT.TCanvas("", "")
            mass_frame.Draw()
            c.SaveAs("{}/mass_{}_{}.jpg".format(output_dir, vtx_name,
                                                cat_name))
            c.SaveAs("{}/mass_{}_{}.pdf".format(output_dir, vtx_name,
                                                cat_name))

            # Fill values for final plots
            parameters = {
                var.GetName(): var.getVal()
                for var in list(model.getParameters(data))
            }

            # See https://root-forum.cern.ch/t/how-to-calculate-effective-sigma/39472/3
            final_plots_specs[vtx_name][cat_name]["fitted_sigma"] = np.sqrt(
                (parameters["sigma1"]**2) * parameters["frac"]
                + (parameters["sigma2"]**2) * (1 - parameters["frac"]))
            # Propagate uncertainty on sigma effective
            # To get the covariances from fit result, remember the indexes
            cov_matrix = fit_result.covarianceMatrix()
            frac_index = 2
            sigma1_index = 6
            sigma2_index = 7

            var_frac = cov_matrix[frac_index][frac_index]
            var_v_1 = cov_matrix[sigma1_index][sigma1_index]
            var_v_2 = cov_matrix[sigma2_index][sigma2_index]

            cov_v_1_v_2 = cov_matrix[sigma1_index][sigma2_index]
            cov_v_1_frac = cov_matrix[sigma1_index][frac_index]
            cov_v_2_frac = cov_matrix[sigma2_index][frac_index]

            final_plots_specs[vtx_name][cat_name][
                "fitted_sigma_unc"] = eff_sigma_unc(
                    parameters["frac"], 1 - parameters["frac"],
                    parameters["sigma1"] - parameters["sigma2"], var_v_1,
                    var_v_2, var_frac, cov_v_1_v_2, cov_v_1_frac, cov_v_2_frac)

    logger.info(
        "Dumping final plots specifications: {}".format(final_plots_specs))

    with open("sigma_m_final_plots_specs_{}.pkl".format(channel), "wb") as fl:
        pickle.dump(final_plots_specs, fl)
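
The uncertainty propagation above delegates to an external eff_sigma_unc helper whose implementation is not shown here. For reference, a standard first-order (delta-method) propagation for sigma_eff = sqrt(frac*sigma1**2 + (1-frac)*sigma2**2) can be sketched as below; this is an assumption about the intended formula, not the project's actual helper, and its argument list differs from the eff_sigma_unc call above.

import numpy as np


def eff_sigma_unc_sketch(sigma1, sigma2, frac,
                         var_s1, var_s2, var_f,
                         cov_s1_s2, cov_s1_f, cov_s2_f):
    """First-order uncertainty on sigma_eff = sqrt(frac*sigma1**2 + (1-frac)*sigma2**2)."""
    sigma_eff = np.sqrt(frac * sigma1**2 + (1 - frac) * sigma2**2)
    # Partial derivatives of sigma_eff with respect to sigma1, sigma2 and frac
    d_s1 = frac * sigma1 / sigma_eff
    d_s2 = (1 - frac) * sigma2 / sigma_eff
    d_f = (sigma1**2 - sigma2**2) / (2 * sigma_eff)
    variance = (d_s1**2 * var_s1 + d_s2**2 * var_s2 + d_f**2 * var_f
                + 2 * d_s1 * d_s2 * cov_s1_s2
                + 2 * d_s1 * d_f * cov_s1_f
                + 2 * d_s2 * d_f * cov_s2_f)
    return np.sqrt(variance)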
Code Example #6
   '../../ntuples/0.9.6-2016_production/JpsiK-mc-step2/JpsiK--22_03_10--mc--12143001--2016--md.root:tree',
   '../../ntuples/0.9.6-2016_production/JpsiK-mc-step2/JpsiK--22_03_10--mc--12143001--2016--mu.root:tree',
]

histoMcRawN = 'h_occupancy_mc_raw'
histoDataRawN = 'h_occupancy_data_raw'
histoRatioN = 'h_occupancy'

mcBrsN = ['b_ownpv_ndof', 'ntracks', 'wjk_occ', 'wpid', 'wtrk']


#########################################
# Rebuild histogram from step-2 ntuples #
#########################################

mcBrsRaw = concatenate(mcNtpsN, mcBrsN, library='np')
globalCut = (mcBrsRaw['ntracks'] < 450) & (mcBrsRaw['b_ownpv_ndof'] < 200)  # Apply a global cut
mcBrs = {k: v[globalCut] for k, v in mcBrsRaw.items()}

wtResult = np.prod([mcBrs[i] for i in ['wpid', 'wtrk', 'wjk_occ']], axis=0)
hResult, *hResultBins = np.histogram2d(
    mcBrs['b_ownpv_ndof'], mcBrs['ntracks'], (20, 20), ((1, 200), (0, 450)),
    weights=wtResult)
hMc, *hMcBins = np.histogram2d(
    mcBrs['b_ownpv_ndof'], mcBrs['ntracks'], (20, 20), ((1, 200), (0, 450)),
    weights=mcBrs['wpid']*mcBrs['wtrk'])


############################
# Load existing histograms #
############################
Code Example #7
    def build_dataframe(
            self,
            data_path: str,
            TTree_name: str,
            tree_dict: Dict[str, Set[str]],
            is_truth: bool,
            is_reco: bool,
            chunksize: int = 1024,
            validate_missing_events: bool = True,
            validate_duplicated_events: bool = True,
            validate_sumofweights: bool = True,
    ) -> pd.DataFrame:
        """
         Builds a dataframe

        :param data_path: path to ROOT datafile(s)
        :param TTree_name: TTree in datapath to set as default tree
        :param tree_dict: dictionary of tree: variables to extract from Datapath
        :param is_truth: whether dataset contains truth data
        :param is_reco: whether dataset contains reco data
        :param chunksize: chunksize for uproot concat method
        :param validate_missing_events: whether to check for missing events
        :param validate_duplicated_events: whether to check for duplicated events
        :param validate_sumofweights: whether to check sum of weights against weight_mc
        :return: output dataframe containing columns corresponding to necessary variables
        """
        self.logger.info(f"Building DataFrame from {data_path} ({file_utils.n_files(data_path)} file(s))...")

        # is the default tree a truth tree?
        default_tree_truth = 'truth' in TTree_name

        t1 = time.time()
        self.logger.debug(f"Extracting {tree_dict[TTree_name]} from {TTree_name} tree...")
        df = to_pandas(uproot.concatenate(data_path + ':' + TTree_name, tree_dict[TTree_name],
                                          num_workers=config.n_threads, begin_chunk_size=chunksize))
        self.logger.debug(f"Extracted {len(df)} events.")

        self.logger.debug(f"Extracting ['total_EventsWeighted', 'dsid'] from 'sumWeights' tree...")
        sumw = to_pandas(uproot.concatenate(data_path + ':sumWeights', ['totalEventsWeighted', 'dsid'],
                                            num_workers=config.n_threads, begin_chunk_size=chunksize))

        self.logger.debug(f"Calculating sum of weights and merging...")
        sumw = sumw.groupby('dsid').sum()
        df = pd.merge(df, sumw, left_on='mcChannelNumber', right_on='dsid', sort=False, copy=False)

        df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True)
        df.index.names = ['DSID', 'eventNumber']
        self.logger.debug("Set DSID/eventNumber as index")

        # merge TTrees
        if validate_duplicated_events:
            validation = '1:1'
            self.logger.info(f"Validating duplicated events in tree {TTree_name}...")
            self.__drop_duplicates(df)
            self.__drop_duplicate_event_numbers(df)
        else:
            validation = 'm:m'
            self.logger.info("Skipping duplicted events validation")

        # iterate over TTrees and merge
        for tree in tree_dict:
            if tree == TTree_name:
                continue

            self.logger.debug(f"Extracting {tree_dict[tree]} from {tree} tree...")
            alt_df = to_pandas(uproot.concatenate(data_path + ":" + tree, tree_dict[tree],
                                                  num_workers=config.n_threads, begin_chunk_size=chunksize))
            self.logger.debug(f"Extracted {len(alt_df)} events.")

            alt_df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True)
            alt_df.index.names = ['DSID', 'eventNumber']
            self.logger.debug("Set DSID/eventNumber as index")

            if validate_missing_events:
                self.logger.info(f"Checking for missing events in tree '{tree}'..")
                tree_is_truth = 'truth' in tree

                if tree_is_truth and not default_tree_truth:
                    if n_missing := len(df.index.difference(alt_df.index)):
                        raise Exception(
                            f"Found {n_missing} events in '{TTree_name}' tree not found in '{tree}' tree")
                    else:
                        self.logger.debug(f"All events in {TTree_name} tree found in {tree} tree")
                elif default_tree_truth and not tree_is_truth:
                    if n_missing := len(alt_df.index.difference(df.index)):
                        raise Exception(
                            f"Found {n_missing} events in '{tree}' tree not found in '{TTree_name}' tree")
                    else:
                        self.logger.debug(f"All events in {tree} tree found in {TTree_name} tree")
                else:
                    self.logger.info(f"Skipping missing events check. Not truth/reco tree combination")
Code Example #8
    print('Compute errors...')
    result.errors(method='minuit_minos')

    print(
        f'Fit & error computation took a total of {time.time() - time_start:.2f} sec.'
    )
    return result, nll


if __name__ == '__main__':
    mplhep.style.use('LHCb2')
    args = parse_input()
    fit_params = load_params(args.params)

    ntp_brs = concatenate(args.input, [args.branch] + args.extraBranches,
                          library='np')
    fit_var = ntp_brs[args.branch]
    print(f'Total events in data: {fit_var.size}')

    print('Initialize fit model...')
    obs = zfit.Space('x', limits=MODEL_BDY)
    fit_model, fit_components, _ = fit_model_overall(obs, fit_var, fit_params)

    output_plot_init = args.output + '/fit_init.pdf' \
        if not args.outputPlotInit else args.outputPlotInit
    ensure_dir(output_plot_init)
    # Always plot the initial condition
    plot(fit_var,
         fit_components,
         output=output_plot_init,
         data_range=MODEL_BDY,
Code Example #9
    bin_centres = np.zeros(len(bin_edges) - 1)
    for i in range(1, len(bin_edges)):
        bin_centres[i - 1] = bin_edges[i - 1] + (bin_edges[i] - bin_edges[i - 1]) / 2
    return bin_centres
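# An equivalent vectorized form of the loop above (a sketch, assuming bin_edges
# is a 1-D numpy array of increasing edges):
#     bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2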


if __name__ == "__main__":
    ntuple_dir = "E:\\NTuples\\TauClassifier"
    sig_files = glob.glob(f"{ntuple_dir}\\*Gammatautau*\\*.root")
    bkg_files = glob.glob(f"{ntuple_dir}\\*JZ*\\*.root")

    cuts = "(TauJets.jet_pt > 15000.0) & (TauJets.jet_pt < 10000000.0)"

    sig_pt = uproot.concatenate(sig_files,
                                filter_name="TauJets.jet_pt",
                                cut=cuts,
                                library='np')

    bkg_pt = uproot.concatenate(bkg_files,
                                filter_name="TauJets.jet_pt",
                                cut=cuts,
                                library='np')

    bkg_pt = bkg_pt["TauJets.jet_pt"]
    bkg_pt = np.sort(bkg_pt) / 1e6

    sig_pt = sig_pt["TauJets.jet_pt"]
    sig_pt = np.sort(sig_pt) / 1e6

    # Binning
    bin_edges = np.percentile(bkg_pt, np.linspace(0.0, 100.0, 50))
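    # np.linspace(0.0, 100.0, 50) yields 50 quantile levels, so bin_edges holds
    # 50 edges (49 bins) chosen so that each bin contains roughly the same
    # number of background entries.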
Code Example #10
dataNtps = '../../run2-JpsiK/fit/fit_results/JpsiK-22_02_26_23_52-std-fit-2016/fit.root:tree'

#########
# Plots #
#########

varsToComp = ['b_ownpv_ndof', 'ntracks', 'b_pt', 'b_eta']
weightBrs = ['wpid', 'wtrk', 'w', 'wjk_kin', 'wjk_occ']
sweightBrs = ['sw_sig']

varsLabels = [r'$B$ PV NDOF', r'nTracks', r'$B$ $p_T$ [MeV]', r'$B$ $\eta$']
dataRanges = [[1, 200], [0, 450], [0, 25e3], [2, 5]]
binnings = [20, 20, 20, 9]

dataBrs = uproot.concatenate(dataNtps, varsToComp + sweightBrs, library='np')
mcBrs = uproot.concatenate(mcNtps, varsToComp + weightBrs, library='np')

# Make numpy histogram consistent w/ ROOT's
globalCut = lambda brs: (brs['ntracks'] < 450) & (brs['b_ownpv_ndof'] < 200) & \
    (brs['b_pt'] < 25e3) & (brs['b_eta'] < 5)
dataCut = globalCut(dataBrs)
mcCut = globalCut(mcBrs)


def plot(output, br, xLabel, dataRange, bins, ratios=False):
    suf = ' MeV' if br == 'b_pt' else ''
    yLabel = f'Norm. / {(dataRange[1]-dataRange[0])/bins:.1f}{suf}'

    topPlotters = []
    botPlotters = []
Code Example #11
#!/usr/bin/env python

import uproot
import numpy as np

# b_pt, b_eta
BINNING = ([20, 9], [[0, 25e3], [2, 5]])
NTPS = '../../ntuples/0.9.6-2016_production/Dst_D0-mc-tracker_only/Dst_D0--22_02_24--mc--tracker_only--MC_2016_Beam6500GeV-2016-MagDown-TrackerOnly-Nu1.6-25ns-Pythia8_Sim09k_Reco16_Filtered_12773410_D0TAUNU.SAFESTRIPTRIG.DST/*-dv.root:TupleBminus/DecayTree'


branches = uproot.concatenate(
    NTPS, ['b_PT', 'b_P', 'b_PZ'], library='np')

brPT, brP, brPZ = branches['b_PT'], branches['b_P'], branches['b_PZ']
brETA = 0.5 * np.log((brP + brPZ) / (brP - brPZ))

histo = np.histogram2d(brPT, brETA, *BINNING)

ntpOut = uproot.recreate('histo.root')
ntpOut['histo'] = histo
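# Note: histo is the (counts, xedges, yedges) tuple returned by np.histogram2d;
# uproot's writing interface accepts such numpy-histogram tuples and stores
# them as a 2-D ROOT histogram.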
Code Example #12
    def __init__(self,
                 data_type,
                 files,
                 class_label,
                 nbatches,
                 variables_dict,
                 dummy_var="truthProng",
                 cuts=None,
                 batch_size=None,
                 label="Dataloader"):
        """
        Class constructor - fills in meta-data for the data type
        :param data_type: The type of data file being loaded e.g. Gammatautau, JZ1, ect...
        :param files: A list of files of the same data type to be loaded
        :param class_label: 1 for signal, 0 for background
        :param variables_dict: dictionary of variables to load
        :param nbatches: number of batches to *roughly* split the data into
        :param dummy_var: A variable to be loaded from the file and iterated through to work out the number
        of events in the data files.
        :param cuts: A string which can be parsed by uproot's cut option e.g. "(pt1 > 50) & ((E1>100) | (E1<90))"
        :param batch_size: Allows you to manually set the batch size for the data. This will override the automatically
        calculated batch size inferred from nbatches
        """
        self._data_type = data_type
        self.label = label
        self.files = files
        self.dummy_var = dummy_var
        self.cut = cuts
        self._nbatches = nbatches
        self.class_label = class_label
        self._variables_dict = variables_dict
        self._current_index = 0

        # Parse variables
        self._variables_list = []
        for _, variable_list in variables_dict.items():
            self._variables_list += variable_list

        # Work out how many events there are in the sample
        test_arr = uproot.concatenate(self.files,
                                      filter_name="TauJets." + self.dummy_var,
                                      step_size=10000,
                                      cut=self.cut,
                                      library='np')
        self._num_events = len(test_arr["TauJets." + self.dummy_var])

        # Set the DataLoader's batch size
        if batch_size is None:
            self.specific_batch_size = math.ceil(self._num_events / nbatches)
        else:
            self.specific_batch_size = batch_size

        # Setup the iterator
        self._batches_generator = uproot.iterate(
            self.files,
            filter_name=self._variables_list,
            cut=self.cut,
            step_size=self.specific_batch_size)

        # Work out the number of batches there are in the generator
        self._num_real_batches = 0
        for _ in uproot.iterate(self.files,
                                filter_name=self._variables_list[0],
                                cut=self.cut,
                                step_size=self.specific_batch_size):
            self._num_real_batches += 1

        logger.log(f"Found {len(files)} files for {data_type}", 'INFO')
        logger.log(f"Found these files: {files}", 'INFO')
        logger.log(f"Found {self._num_events} events for {data_type}", 'INFO')
        logger.log(
            f"Number of batches in {self.label} {self.data_type()} = {self._num_real_batches}",
            'DEBUG')
        logger.log(f"DataLoader for {data_type} initialized", "INFO")
Code Example #13
    'MC_WZneutrino_pt_born',
    'MC_WZmu_el_phi_born',
    'MC_WZneutrino_phi_born',
    'MC_WZ_dilep_m_born',
    'mcChannelNumber',
    'weight_mc',
    'KFactor_weight_truth',
    'weight_pileup',
    'eventNumber',
]

# pull root data
t = time.time()
nominal_df = to_pandas(
    uproot.concatenate(DATAFILE + ':nominal_Loose',
                       BRANCHES_NOMINAL,
                       num_workers=N_THREADS))
print(f"Importing nominal from ROOT: {time.time() - t:.3f}s")
nominal_df.to_pickle(OUT_DIR + 'nominal_wtaunu.pkl')

before_len = len(nominal_df.index)
print(f"number of events in nominal: {before_len}")
t = time.time()
nominal_df.drop_duplicates(inplace=True)
print(
    f"Dropped {before_len - len(nominal_df)} duplicate events: {time.time() - t:.3f}s"
)
before_len = len(nominal_df.index)
t = time.time()
nominal_df.drop_duplicates(['eventNumber', 'mcChannelNumber'], inplace=True)
print(
Code Example #14
def main(args):
    logger = setup_logging()

    v0_input_dir = args.v0_input_dir
    vcustom_input_dir = args.vcustom_input_dir
    output_dir = args.output_dir
    channel = args.channel

    tree_name = tree_name_tmpl.format(channel)

    plots_specs = {}

    # Needed names for files and trees
    file_dirs = {"Vertex 0th": v0_input_dir, "Vertex Reco": vcustom_input_dir}

    # Create sigma_m_over_m categories
    logger.info("Creating categories of SigmaMOverM")
    file_format = {
        "Vertex 0th": v0_input_dir + "/" + file_names_tmpl[channel],
        "Vertex Reco": vcustom_input_dir + "/" + file_names_tmpl[channel]
    }

    categories = {}
    smom = "sigma_m"  # due to how we defined it in flashgg, it's already divided by M
    for vtx_name, direc in file_format.items():
        categories[vtx_name] = {}
        plots_specs[vtx_name] = {}

        arr = uproot.concatenate(["{}:{}".format(direc, tree_name)],
                                 expressions=[smom],
                                 library="ak")
        arr = np.asarray([ev[0] for ev in arr.to_numpy()])

        cut_format = "({var} > {min_edge}) & ({var} < {max_edge})"
        edge_min = 0.
        edge_max = 0.035
        n_bins = 5
        edges = get_edges(arr, edge_min, edge_max, n_bins)

        low = edges[0]
        for high in edges[1:]:
            cat_name = "SigmaMOverM_{:.5f}-{:.5f}".format(low, high)
            cat_string = cut_format.format(var=smom,
                                           min_edge=low,
                                           max_edge=high)
            categories[vtx_name][cat_name] = cat_string

            plots_specs[vtx_name][cat_name] = {}
            plots_specs[vtx_name][cat_name]["range"] = (low, high)

            low = high

    logger.info("Created categories {}".format(categories))

    for vtx_name, direc in file_dirs.items():
        logger.info("Working with vertex {}".format(vtx_name))
        for cat_name, cut in categories[vtx_name].items():
            logger.info("Working with category {}".format(cat_name))

            files = [
                fl for fl in os.listdir(direc)
                if fl.startswith(file_names_tmpl[channel][:20])
            ]

            events = uproot.concatenate(
                [direc + "/" + fl + ":" + tree_name for fl in files],
                ["mass", "weight"],
                cut,
                library="np")

            mass = events["mass"]
            plots_specs[vtx_name][cat_name][
                "sigma_effective"] = sigma_effective(mass)

    # Plot
    x_v0 = [
        cat_spec["range"][0] +
        abs(cat_spec["range"][1] - cat_spec["range"][0]) / 2
        for cat_spec in plots_specs["Vertex 0th"].values()
    ]
    x_vcustom = [
        cat_spec["range"][0] +
        abs(cat_spec["range"][1] - cat_spec["range"][0]) / 2
        for cat_spec in plots_specs["Vertex Reco"].values()
    ]
    x_s = {"Vertex 0th": x_v0, "Vertex Reco": x_vcustom}
    fmts = {"Vertex 0th": "r^", "Vertex Reco": "sb"}

    fig, (ax, rax) = plt.subplots(nrows=2,
                                  ncols=1,
                                  gridspec_kw={"height_ratios": (3, 1)},
                                  sharex=True)

    fig.suptitle(channel)

    for vtx_name, cat_specs in plots_specs.items():
        ax.plot(x_s[vtx_name], [
            cat_spec["sigma_effective"]
            for cat_spec in plots_specs[vtx_name].values()
        ],
                fmts[vtx_name],
                label=vtx_name)

    rax_y = [
        rel_diff(s0, sc) for s0, sc in zip([
            plots_specs["Vertex 0th"][cat]["sigma_effective"]
            for cat in list(categories["Vertex 0th"].keys())
        ], [
            plots_specs["Vertex Reco"][cat]["sigma_effective"]
            for cat in list(categories["Vertex Reco"].keys())
        ])
    ]

    logger.info("Relative differences: {}".format(rax_y))

    rax.plot(x_s["Vertex 0th"], rax_y, "ko")

    for x in [ax, rax]:
        for cat in plots_specs["Vertex 0th"].values():
            low = cat["range"][0]
            x.axvline(low, color="black", alpha=0.4)

    rax.set_xlabel(r"$\sigma_M / M$")
    ax.set_ylabel(r"$\sigma_{effective}$")
    rax.set_ylabel(r"$rel\ diff$")
    ax.set_xlim(0.)
    ax.set_ylim(0.)
    rax.set_ylim(-0.01, 0.2)
    ax.legend(loc="upper left")
    ax.grid(which="both")
    rax.grid(which="both")

    logger.info("Dumping plot in {}".format(output_dir))
    hep.cms.label(loc=0,
                  data=True,
                  llabel="Work in Progress",
                  rlabel="",
                  ax=ax,
                  pad=.05)
    fig.savefig("{}/sigma_effective.png".format(output_dir),
                bbox_inches='tight')
    fig.savefig("{}/sigma_effective.pdf".format(output_dir),
                bbox_inches='tight')
Code Example #15
wtJkOccRoot = mcRootBrs['wjk_occ']
wtJkOccAltRoot = mcRootBrs['wjk_alt']

histoRootMdl = TH2DModel(
    'histoRoot', 'histoRoot',
    20, 1, 200, 20, 0, 450
)
histoRoot = df.Histo2D(histoRootMdl, 'b_ownpv_ndof', 'ntracks', 'wt')


##################
# Histo w/ numpy #
##################

mcNumpyBrsN = ['b_ownpv_ndof', 'ntracks', 'wpid', 'wtrk', 'wjk_occ']
mcNumpyBrs = uproot.concatenate(f'{mcNtpN}:{mcTreeN}', mcNumpyBrsN, library='np')


def getWeights(branches, histoRaw):
    histo, *binSpecs = histoRaw
    histoPadded = np.pad(histo, tuple((1, 1) for _ in range(histo.ndim)))
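    # With the zero padding, the indices returned by np.digitize line up with
    # the histogram bins directly (digitize gives 1..n for in-range values),
    # and out-of-range entries (index 0 or n + 1) pick up a weight of 0.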
    binIdx = tuple(np.digitize(br, spec)
                   for br, spec in zip(branches, binSpecs))

    return histoPadded[binIdx]

histoWtNp = uproot.open(histoNtpN)[histoN].to_numpy()

wtJkOccNp = getWeights(
    (mcNumpyBrs['b_ownpv_ndof'], mcNumpyBrs['ntracks']), histoWtNp)
histoNumpy, *_ = np.histogram2d(
Code Example #16
def load_brs(ntp, tree, add_brs=None):
    br_names = [] if not add_brs else deepcopy(add_brs)
    for r in REWEIGHT_PROCEDURE.values():
        br_names += r.vars

    return concatenate([f'{i}:{tree}' for i in ntp], br_names, library='np')
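
A hypothetical call of load_brs, with placeholder ntuple paths, tree name, and extra branch (none of these names come from the original code):

brs = load_brs(['ntuple-md.root', 'ntuple-mu.root'], 'tree',
               add_brs=['runNumber'])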
Code Example #17
def test_concatenate_numpy():
    files = skhep_testdata.data_path(
        "uproot-sample-6.20.04-uncompressed.root").replace("6.20.04", "*")
    arrays = uproot.concatenate({files: "sample"}, ["i8", "f8"], library="np")
    assert len(arrays["i8"]) == 420
    assert len(arrays["f8"]) == 420
Code Example #18
def test_concatenate():
    with pytest.raises(ValueError):
        uproot.concatenate(skhep_testdata.data_path("uproot-issue63.root"))

    assert (len(
        uproot.concatenate(
            {skhep_testdata.data_path("uproot-issue63.root"): "blah"},
            allow_missing=True,
        )) == 0)

    files = skhep_testdata.data_path(
        "uproot-sample-6.16.00-uncompressed.root").replace("6.16.00", "*")

    uproot.concatenate(files, "Ai8")
    uproot.concatenate({files: "sample"}, "Ai8")
    uproot.concatenate([files], "Ai8")
    uproot.concatenate([{files: "sample"}], "Ai8")
Code Example #19
def main(args):

    # Read nano, micro, EB or EE cuts
    nanoaod_arr = ak.from_parquet(args.nano_input_dir)
    print("Read nanoaod: {}".format(nanoaod_arr.type))
    
    microaod_arr = uproot.concatenate(
        ["{}/*.root:diphotonDumper/trees/ggH_125_13TeV_All_$SYST".format(args.micro_input_dir)]
        )
    print("Read microaod: {}".format(microaod_arr.type))
    # Stupid typo in flashgg
    if "lead_ch_iso_worst__uncorr" in microaod_arr.fields:
        microaod_arr["lead_ch_iso_worst_uncorr"] = microaod_arr["lead_ch_iso_worst__uncorr"]

    if args.sd == "EB":
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) < 1.5]
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) < 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) < 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) < 1.5]

    if args.sd == "EE":
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) > 1.5]
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) > 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) > 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) > 1.5]

    # Read catalogue of variables to be plotted
    with open("plots_specs.json", "r") as f:
        columns = json.load(f)

    # Create dict where keys are names of variables in nano and values are names of variables in micro
    nano_micro_names = {var["nano_col"]: var["micro_col"] for var in columns}
    nano_micro_names["event"] = "event"
    nano_micro_names["lumi"] = "lumi"

    # Event by event
    nano_dict = {k: nanoaod_arr[k] for k in nano_micro_names.keys()}
    nano_dict["lead_fixedGridRhoAll"] = nanoaod_arr["lead_fixedGridRhoAll"] # needed for XGBoost vs TMVA
    test_nano = ak.Array(nano_dict)

    test_micro = microaod_arr[nano_micro_names.values()]

    pd_nano = ak.to_pandas(test_nano)
    pd_micro = ak.to_pandas(test_micro)

    pd_nano = pd_nano.set_index(["event", "lumi"])
    pd_micro = pd_micro.set_index(["event", "lumi"])

    pd_joined = pd_nano.join(pd_micro, lsuffix="_nano", rsuffix="_micro")

    print("Joined dataframe:\n{}".format(pd_joined))

    #Remove NaN values
    for nano_name, micro_name in nano_micro_names.items():
        if nano_name in ["event", "lumi"]:
            break
        if nano_name == micro_name:
            nano_name += "_nano"
            micro_name += "_micro"
        pd_joined = pd_joined[pd_joined[nano_name].notna()]
        pd_joined = pd_joined[pd_joined[micro_name].notna()]

    # Cut over delta R
    # Here https://github.com/CoffeaTeam/coffea/blob/3db3fab23064c70d0ca63b185d51c7fa3b7849dc/coffea/nanoevents/methods/vector.py#L74
    # useful info
    deltaR_threshold = 0.1

    four_lead_nano = vector.obj(
        pt=pd_joined["lead_pt"],
        phi=pd_joined["lead_phi_nano"],
        eta=pd_joined["lead_eta_nano"],
        E=pd_joined["lead_energyRaw"]
    )

    four_sublead_nano = vector.obj(
        pt=pd_joined["sublead_pt"],
        phi=pd_joined["sublead_phi_nano"],
        eta=pd_joined["sublead_eta_nano"],
        E=pd_joined["sublead_energyRaw"]
    )

    pd_joined["deltaR_nano"] = four_lead_nano.deltaR(four_sublead_nano)

    four_lead_micro = vector.obj(
        pt=pd_joined["leadPt"],
        phi=pd_joined["lead_phi_micro"],
        eta=pd_joined["lead_eta_micro"],
        E=pd_joined["lead_SCRawE"]
    )

    four_sublead_micro = vector.obj(
        pt=pd_joined["subleadPt"],
        phi=pd_joined["sublead_phi_micro"],
        eta=pd_joined["sublead_eta_micro"],
        E=pd_joined["sublead_SCRawE"]
    )

    pd_joined["lead_deltaR"] = four_lead_nano.deltaR(four_lead_micro)
    pd_joined["sublead_deltaR"] = four_sublead_nano.deltaR(four_sublead_micro)
    pd_joined = pd_joined[pd_joined["lead_deltaR"] < deltaR_threshold]
    pd_joined = pd_joined[pd_joined["sublead_deltaR"] < deltaR_threshold]
    print("Final joined dataframe:\n{}".format(pd_joined))

    # Plot
    print("Start plotting")
    for column in columns:
        fig, (up, middle, down) = plt.subplots(
            nrows=3,
            ncols=1,
            gridspec_kw={"height_ratios": (2, 1, 1)}
            )

        nano_name = column["nano_col"]
        micro_name = column["micro_col"]

        if nano_name == micro_name:
            nano_name += "_nano"
            micro_name += "_micro"
        
        range = column["range"]

        # Up
        n, n_, n__ = up.hist(pd_joined[nano_name], bins=column["bins"], range=range, histtype="step", label="NanoAOD", linewidth=2)
        m, m_, m__ = up.hist(pd_joined[micro_name], bins=column["bins"], range=range, histtype="step", label="MicroAOD", linewidth=2)

        up.legend(fontsize=18, loc="upper right")
        up.set_xlim(range)
        up.set_xlabel(column["var"])
        up.set_ylabel("Events")
        if "log" in column:
            up.set_yscale("log")
        
        # Middle
        ylim = [0, 2]
        middle.set_ylim(ylim)
        #middle.axhline(1, xmin=range[0], xmax=range[1], color="black", alpha=0.6)
        centers = (n_[:-1] + n_[1:]) / 2
        middle.plot(centers, n / m, "k.")
        middle.set_xlim(range)
        middle.set_xlabel(column["var"])
        middle.set_ylabel(r"$n/\mu$")
        middle.grid(which="both")

        # Down
        perc_range = (-300, 300)
        perc_bins = 500
        down.hist(100 * (pd_joined[nano_name] - pd_joined[micro_name]) / pd_joined[micro_name], 
                  bins=perc_bins,
                  range=perc_range,
                  histtype="step",
                  density=True,
                  color="black",
                  linewidth=2)
        #down.set_yscale("log")
        down.set_xlabel(r"$(n_{ev} - \mu_{ev})/\mu_{ev}$ [%]")
        down.set_ylabel("Events / {}%".format((perc_range[1] - perc_range[0]) / perc_bins))

        print(column["nano_col"])
        print("nano: {}".format(np.sum(n)))
        print("micro: {}".format(np.sum(m)))
        print("diff = {}".format(abs(np.sum(n) - np.sum(m))))
        print("rel diff = {}%\n".format(100 * abs(np.sum(n) - np.sum(m)) / max(np.sum(n), np.sum(m))))

        fig.tight_layout()

        fig.savefig("{}/{}_{}.png".format(args.output_dir, column["nano_col"], args.sd), bbox_inches='tight')
        fig.savefig("{}/{}_{}.pdf".format(args.output_dir, column["nano_col"], args.sd), bbox_inches='tight')

        plt.close(fig)

    # Dump pandas dataframe to parquet file
    pd_joined.to_parquet("nano_micro_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped dataframe to parquet file")

    # Redundant: dump separate dataframes for nano and micro with PhotonID inputs
    nano_vars = {
        "r9": "lead_r9_nano", 
        "s4": "lead_s4_nano",
        "sieie": "lead_sieie_nano",
        "etaWidth": "lead_etaWidth",
        "phiWidth": "lead_phiWidth",
        "sieip": "lead_sieip_nano",
        "pfPhoIso03": "lead_pfPhoIso03",
        "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx",

        "mva_ID": "lead_mvaID_recomputed"
        }

    micro_vars = {
        "r9": "lead_r9_micro", 
        "s4": "lead_s4_micro",
        "sieie": "lead_sieie_micro",
        "etaWidth": "lead_eta_width",
        "phiWidth": "lead_phi_width",
        "sieip": "lead_sieip_micro",
        "pfPhoIso03": "lead_pho_iso",
        "pfChargedIsoPFPV": "lead_ch_iso",
        "pfChargedIsoWorstVtx": "lead_ch_iso_worst",

        "mva_ID": "lead_mva"
        }

    nano_isos = {
        "pfPhoIso03": "lead_pfPhoIso03",
        "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx",
        "pfPhoIso03_uncorr": "lead_uncorr_pfPhoIso03",
        "pfChargedIsoPFPV_uncorr": "lead_uncorr_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx_uncorr": "lead_uncorr_pfChargedIsoWorstVtx",
        }

    micro_isos = {
        "pfPhoIso03": "lead_pho_iso",
        "pfChargedIsoPFPV": "lead_ch_iso",
        "pfChargedIsoWorstVtx": "lead_ch_iso_worst",
        "pfPhoIso03_uncorr": "lead_pho_iso_uncorr",
        "pfChargedIsoPFPV_uncorr": "lead_ch_iso_uncorr",
        "pfChargedIsoWorstVtx_uncorr": "lead_ch_iso_worst_uncorr",
       }

    nano_df = pd_joined[list(nano_vars.values())]
    nano_df.rename(columns=dict((v, k) for k, v in nano_vars.items()), inplace=True)
    nano_df.to_parquet("nano_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped nano dataframe to parquet file")

    micro_df = pd_joined[list(micro_vars.values())]
    micro_df.rename(columns=dict((v, k) for k, v in micro_vars.items()), inplace=True)
    micro_df.to_parquet("micro_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped micro dataframe to parquet file")

    nano_df = pd_joined[list(nano_isos.values())]
    nano_df.rename(columns=dict((v, k) for k, v in nano_isos.items()), inplace=True)
    nano_df.to_parquet("nano_{}_isos.parquet".format(args.sd), engine="fastparquet")
    print("Dumped nano dataframe for isos to parquet file")

    micro_df = pd_joined[list(micro_isos.values())]
    micro_df.rename(columns=dict((v, k) for k, v in micro_isos.items()), inplace=True)
    micro_df.to_parquet("micro_{}_isos.parquet".format(args.sd), engine="fastparquet")
    print("Dumped micro dataframe for isos to parquet file")
Code Example #20
    'MC_WZmu_el_pt_born',
    'MC_WZneutrino_pt_born',
    'MC_WZmu_el_phi_born',
    'MC_WZneutrino_phi_born',
    'MC_WZ_dilep_m_born',
    'mcChannelNumber',
    'weight_mc',
    'KFactor_weight_truth',
    'weight_pileup',
    'eventNumber',
]

# pull root data
t = time.time()
truth_df = to_pandas(
    uproot.concatenate(DATAFILE + ':truth', BRANCHES, num_workers=N_THREADS))
print(f"Importing from ROOT: {time.time() - t:.3f}s")\

# # delete duplicate events
# t = time.time()
# len_before = len(truth_df.index)
# truth_df.drop_duplicates('eventNumber', keep='first', inplace=True)
# print(f"Dropping duplicates: {time.time() - t:.3f}s ({len_before - len(truth_df.index)} duplicates found)")

# calculate sum of weights
t = time.time()
sumw = to_pandas(
    uproot.concatenate(DATAFILE + ':sumWeights',
                       ['dsid', 'totalEventsWeighted'],
                       num_workers=N_THREADS))
sumw = sumw.groupby('dsid').sum()