Esempio n. 1
0
def make_staged_target(stage_dir, target):
    """
    Return the staged counterpart of *target* located inside *stage_dir*.

    *stage_dir* is wrapped into a :py:class:`LocalDirectoryTarget` when it is not one already. The
    returned child is named after ``target.unique_basename``, has the same ``type`` as *target* and
    receives the keyword arguments returned by ``target._copy_kwargs()``.
    """
    # make sure the child lookup below operates on a directory target
    if isinstance(stage_dir, LocalDirectoryTarget):
        staging = stage_dir
    else:
        staging = LocalDirectoryTarget(stage_dir)

    return staging.child(target.unique_basename, type=target.type, **target._copy_kwargs())
Esempio n. 2
0
    def prepare_stageout(self):
        """
        Prepare local placeholder targets for the task outputs selected for stage-out.

        The task outputs are filtered with the task's stage-out mask. When nothing is left, *None* is
        returned. Otherwise a temporary directory is created and a :py:class:`StageInfo` is returned
        that holds the masked outputs, the temporary directory and the structure of staged targets
        inside of it.
        """
        masked_outputs = mask_struct(self.task.sandbox_stageout_mask(), self.task.output())
        if not masked_outputs:
            return None

        # temporary directory that will contain the staged output files
        stage_dir = LocalDirectoryTarget(is_tmp=True)
        stage_dir.touch()

        # mirror each output target with a local target inside the temporary directory
        staged_outputs = map_struct(
            lambda target: make_staged_target(stage_dir, target), masked_outputs)

        return StageInfo(masked_outputs, stage_dir, staged_outputs)
Esempio n. 3
0
    def stagein(self):
        """
        Copy the task inputs selected for stage-in into a fresh temporary directory.

        The task inputs are filtered with the task's stage-in mask. When nothing is left, *None* is
        returned. Otherwise each remaining target is copied to a staged target inside a temporary
        directory and a :py:class:`StageInfo` is returned that holds the masked inputs, the temporary
        directory and the structure of staged targets.
        """
        masked_inputs = mask_struct(self.task.sandbox_stagein_mask(), self.task.input())
        if not masked_inputs:
            return None

        # temporary directory that receives the copies
        stage_dir = LocalDirectoryTarget(is_tmp=True)
        stage_dir.touch()

        def copy_to_stage(target):
            # create the local counterpart first, then fill it with the content of the input
            staged = make_staged_target(stage_dir, target)
            target.copy(staged)
            return staged

        staged_inputs = map_struct(copy_to_stage, masked_inputs)

        return StageInfo(masked_inputs, stage_dir, staged_inputs)
Esempio n. 4
0
    def from_directory(cls, directory, **kwargs):
        """
        Create a collection from all files found in *directory*.

        *directory* must be either a string, in which case it is interpreted as a local path, or a
        :py:class:`FileSystemDirectoryTarget`. All *kwargs* are forwarded to the directory's
        ``listdir`` method and may filter the result further; ``type`` is always set to ``"f"`` so
        that only files are listed.

        Raises a *TypeError* when *directory* is neither a string nor a directory target.
        """
        # note: the type checks must inspect *directory* itself, "d" is only bound afterwards
        if isinstance(directory, six.string_types):
            d = LocalDirectoryTarget(directory)
        elif isinstance(directory, FileSystemDirectoryTarget):
            d = directory
        else:
            raise TypeError("directory must either be a string or a FileSystemDirectoryTarget "
                "object, got '{}'".format(directory))

        # find all files, pass kwargs which may filter the result further
        kwargs["type"] = "f"
        basenames = d.listdir(**kwargs)

        # convert to file targets
        targets = [d.child(basename, type="f") for basename in basenames]

        return cls(targets)
Esempio n. 5
0
    def localize(self, *args, **kwargs):
        """
        Generator that localizes all targets of this collection at once and yields a new instance of
        the same class wrapping the localized targets.

        All *args* and *kwargs* are forwarded to :py:func:`localize_file_targets`. When no (or an
        empty) ``tmp_dir`` is passed, a temporary :py:class:`LocalDirectoryTarget` is created so that
        all targets are localized into the same directory.
        """
        # share a single temporary directory among all targets unless the caller provided one
        kwargs["tmp_dir"] = kwargs.get("tmp_dir") or LocalDirectoryTarget(is_tmp=True)

        # enter the localize contexts of all targets and hand out a collection copy wrapping them
        with localize_file_targets(self.targets, *args, **kwargs) as localized_targets:
            localized_collection = self.__class__(localized_targets, **self._copy_kwargs())
            yield localized_collection
Esempio n. 6
0
    def run(self):
        """
        Run the task inside the sandbox, including file staging and the optional task hooks.

        The sequence is: ``sandbox_before_run`` hook (when callable), stage-in of inputs and
        preparation of the stage-out into a temporary directory, execution of the sandbox command,
        the actual stage-out, and finally the ``sandbox_after_run`` hook (when callable). An
        *Exception* is raised when the sandbox command exits with a non-zero code.
        """
        # optional hook invoked before anything else happens
        before_hook = self.task.sandbox_before_run
        if callable(before_hook):
            before_hook()

        # temporary directory used for file staging
        tmp_dir = LocalDirectoryTarget(is_tmp=True)
        tmp_dir.touch()

        # stage-in input files and let the sandbox know about them
        stagein_info = self.stagein(tmp_dir)
        if stagein_info:
            self.sandbox_inst.stagein_info = stagein_info
            logger.debug("configured sandbox stage-in data")

        # prepare the stage-out and let the sandbox know about it
        stageout_info = self.prepare_stageout(tmp_dir)
        if stageout_info:
            self.sandbox_inst.stageout_info = stageout_info
            logger.debug("configured sandbox stage-out data")

        # build the command that is actually executed
        cmd = self.sandbox_inst.cmd(self.proxy_cmd())

        # execute it, framed by the log section of the run context
        with self._run_log(cmd):
            code, _out, _err = self.sandbox_inst.run(cmd)
            if code != 0:
                msg = "Sandbox '{}' failed with exit code {}".format(self.sandbox_inst.key, code)
                raise Exception(msg)

        # move staged outputs to their final location
        if stageout_info:
            self.stageout(stageout_info)

        # optional hook invoked after everything else succeeded
        after_hook = self.task.sandbox_after_run
        if callable(after_hook):
            after_hook()
Esempio n. 7
0
File: remote.py Progetto: silky/law
 def localize(self, path=None, is_tmp=True, skip_parent=False, mode=0o0770, cache=True,
              retry=None):
     """
     Generator that yields a local directory target and uploads its content afterwards.

     A :py:class:`LocalDirectoryTarget` is created from *path* and *is_tmp* and yielded to the
     caller (presumably through a context manager decorator that is not visible here). Once the
     caller is done, the parent of this target is touched unless *skip_parent* is set, all files
     found below the local directory are passed to :py:meth:`put` together with their paths
     relative to the local directory, and *mode* is applied via :py:meth:`chmod`. *cache* and
     *retry* are forwarded to these calls.
     """
     tmp = LocalDirectoryTarget(path, is_tmp=is_tmp)
     try:
         yield tmp

         if not skip_parent:
             self.parent.touch()

         # pair up the full local path of every file with its path relative to the directory
         local_paths = []
         relative_paths = []
         for base, _dirs, files in tmp.walk():
             for name in files:
                 local_path = os.path.join(base, name)
                 local_paths.append(local_path)
                 relative_paths.append(os.path.relpath(local_path, tmp.path))

         self.put(local_paths, relative_paths, cache=cache, retry=retry)
         self.chmod(mode, retry=retry)
     finally:
         # drop the reference to the local directory target in any case
         del tmp
Esempio n. 8
0
    def localize(self, *args, **kwargs):
        """
        Generator that localizes all targets of this collection at once and yields a new instance of
        the same class wrapping the localized targets.

        All *args* and *kwargs* are forwarded to :py:func:`localize_file_targets`. When no (or an
        empty) ``tmp_dir`` is passed, a temporary :py:class:`LocalDirectoryTarget` is created, its
        path is forwarded as ``tmp_dir``, and the directory is removed again when the generator is
        left, also in case of an error.
        """
        # when localizing collections using temporary files, it makes sense to put them all in
        # the same temporary directory, which is owned by this method if it had to be created
        tmp_dir = kwargs.get("tmp_dir")
        owns_tmp_dir = not tmp_dir
        if owns_tmp_dir:
            tmp_dir = LocalDirectoryTarget(is_tmp=True)
            kwargs["tmp_dir"] = tmp_dir.path

        # enter the localize contexts of all targets
        with localize_file_targets(self.targets, *args, **kwargs) as localized_targets:
            # copy of this collection that wraps the localized targets
            localized_collection = self.__class__(localized_targets, **self._copy_kwargs())

            try:
                yield localized_collection
            finally:
                # the directory would clean itself up during garbage collection, but after an
                # error it is safer to remove a self-created directory explicitly
                if owns_tmp_dir:
                    tmp_dir.remove()
Esempio n. 9
0
    def run(self):
        """
        Draw the fitted scale factor histograms per category and store all plots in one tarball.

        For every input configuration (b-tagger, iteration, version) and every shift, the fit and
        scale factor histograms of each category are read. Depending on the task settings they are
        normalized, divided by the nominal fit, drawn directly, or collected to build an up/down
        envelope over multiple shifts. The plots are saved into a temporary directory whose content
        is then written into the (localized) output as a gzipped tar archive.

        Raises:
            KeyError: when *norm_to_nominal* is set but the first shift is not ``"nominal"``, or
                when a category found in a fit file is unknown to the config instance.
            ValueError: when, while combining multiple shifts, a shift name does not end with
                ``_up`` or ``_down``.
        """
        import ROOT

        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        inp = self.input()
        outp = self.output()

        # plots are written here first and packed into the output archive at the end
        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        # plot objects, shared between configurations and keyed by the plot category name
        plots = {}

        if self.norm_to_nominal and self.shifts[0] != "nominal":
            raise KeyError("'norm_to_nominal' is set to true, but no nominal values found.")

        for color_idx, (config, config_input) in enumerate(inp.items()):
            b_tagger, iteration, version = config

            # human readable identifier of this configuration, used in legend entries
            config_ids = [b_tagger]
            if len(self.iterations) > 1:
                config_ids.append("iteration {}".format(iteration))
            if len(self.versions) > 1:
                config_ids.append("version {}".format(version))
            config_id = ", ".join(config_ids)

            nominal_hists = {}
            nominal_fit_hists = {}
            # combined errors for multiple shifts
            up_shifted_hists = defaultdict(dict)
            up_shifted_fit_hists = defaultdict(dict)
            down_shifted_hists = defaultdict(dict)
            down_shifted_fit_hists = defaultdict(dict)

            # the normalization input is removed so that only shifts remain in config_input
            if self.fix_normalization:
                normalization_input = config_input.pop("norm")
            for shift_idx, (shift, inp_target) in enumerate(config_input.items()):
                # get scaling factors for normalization
                if self.fix_normalization:
                    norm_factors = normalization_input.load()[shift]

                with inp_target["fit"]["sf"].load("r") as fit_file, \
                        inp_target["hist"]["scale_factors"].load("r") as hist_file:
                    for category_key in fit_file.GetListOfKeys():
                        category_name = category_key.GetName()
                        if not self.config_inst.has_category(category_name):
                            raise KeyError("Unknown category {}".format(category_name))

                        category = self.config_inst.get_category(category_name)
                        pt_range = category.get_aux("pt")
                        eta_range = category.get_aux("eta")
                        region = category.get_aux("region")

                        # same category name for different b-taggers
                        if len(self.b_taggers) > 1:
                            plot_category = category.name.replace("__" + b_tagger, "")
                        else:
                            plot_category = category.name

                        fit_category_dir = fit_file.Get(category_name)
                        fit_hist = fit_category_dir.Get(self.hist_name)

                        hist_category_dir = hist_file.Get(category_name)
                        hist = hist_category_dir.Get(self.hist_name)
                        # truncate first bin
                        hist = self.rebin_hist(hist, region, b_tagger=b_tagger, truncate=True)

                        # normalize histogram if required
                        # fit histograms are already normalized in FitScaleFactors
                        if self.fix_normalization and not self.is_c_flavour:
                            hist.Scale(norm_factors[category_name])

                        if shift == "nominal":
                            # make sure histograms are not cleaned up when the file is closed
                            nominal_fit_hists[plot_category] = fit_hist.Clone()
                            nominal_fit_hists[plot_category].SetDirectory(0)

                            nominal_hists[plot_category] = hist.Clone()
                            nominal_hists[plot_category].SetDirectory(0)

                        # for c-jets, there is no nominal histogram
                        # Instead, all nominal values are set to 1
                        if shift_idx == 0 and self.is_c_flavour:
                            nominal_fit_hist = fit_hist.Clone()
                            for bin_idx in range(1, nominal_fit_hist.GetNbinsX() + 1):
                                nominal_fit_hist.SetBinContent(bin_idx, 1.0)

                            nominal_fit_hist.SetDirectory(0)
                            nominal_fit_hists[plot_category] = nominal_fit_hist

                        if shift != "nominal" and self.multiple_shifts:
                            # collect all shifted fit histograms to build envelope later
                            sys, direction = shift.rsplit("_", 1)
                            if direction == "up":
                                up_shifted_fit_hists[plot_category][sys] = fit_hist.Clone()
                                up_shifted_fit_hists[plot_category][sys].SetDirectory(0)
                                up_shifted_hists[plot_category][sys] = hist.Clone()
                                up_shifted_hists[plot_category][sys].SetDirectory(0)
                            elif direction == "down":
                                down_shifted_fit_hists[plot_category][sys] = fit_hist.Clone()
                                down_shifted_fit_hists[plot_category][sys].SetDirectory(0)
                                down_shifted_hists[plot_category][sys] = hist.Clone()
                                down_shifted_hists[plot_category][sys].SetDirectory(0)
                            else:
                                raise ValueError("Unknown direction {}".format(direction))

                        if self.norm_to_nominal:
                            fit_hist.Divide(nominal_fit_hists[plot_category])

                        # get same category key for all b-taggers
                        if plot_category in plots:
                            plot = plots[plot_category]
                        else:
                            plot = ROOTPlot(category.name, category.name)
                            plot.create_pads()
                            plots[plot_category] = plot
                        plot.cd(0, 0)
                        fit_hist.GetXaxis().SetRangeUser(-.1, 1.0)
                        # narrower y range when drawing ratios to the nominal fit
                        y_min = 0.6 if self.norm_to_nominal else 0.
                        y_max = 1.4 if self.norm_to_nominal else 2.
                        fit_hist.GetYaxis().SetRangeUser(y_min, y_max)

                        if len(self.b_taggers) == 1:
                            title = self.config_inst.get_aux("btaggers")[b_tagger]["label"] + " discriminator"
                        else:
                            title = "B-Tag Discriminant"

                        fit_hist.GetXaxis().SetTitle(title)
                        fit_hist.GetYaxis().SetTitle("SF")

                        if shift_idx == 0:
                            if not self.multiple_shifts or shift == "nominal":
                                # only draw this fit histogram if it is not part of a shifted envelope
                                plot.draw({"sf": fit_hist}, line_color=1, add_to_legend=False)
                            # vertical dashed line at a discriminator value of zero
                            line = ROOT.TLine(0., 0., 0., 2.)
                            line.SetLineStyle(9)
                            plot.draw({"line": line}, add_same_option=False, line_color=1, add_to_legend=False)

                            # add category information to plot
                            if not np.isinf(pt_range[1]):
                                text = r"#splitline{%d < p_{T} < %d}{%.1f < |#eta| < %.1f}" % \
                                    (pt_range[0], pt_range[1], eta_range[0], eta_range[1])
                            else:
                                text = r"#splitline{p_{T} > %d}{%.1f < |#eta| < %.1f}" % \
                                    (pt_range[0], eta_range[0], eta_range[1])
                            plot.draw_text(text)
                        elif not self.multiple_shifts:
                            plot.draw({shift: fit_hist}, line_color=None)

                        if shift == "nominal" and not self.norm_to_nominal:
                            plot.draw({config_id + ", nominal": hist}, line_color=1,
                                add_to_legend=(len(self.shifts) != 1))

            if self.multiple_shifts:
                for plot_category in plots:
                    plot = plots[plot_category]
                    plot.cd(0, 0)

                    # build shifted histograms
                    fit_hist_down, fit_hist_up = build_hist_envelope(nominal_fit_hists[plot_category],
                        up_shifted_fit_hists[plot_category], down_shifted_fit_hists[plot_category],
                        envelope_as_errors=False)

                    hist_down, hist_up = build_hist_envelope(nominal_hists[plot_category],
                        up_shifted_hists[plot_category], down_shifted_hists[plot_category],
                        envelope_as_errors=False)

                    if self.norm_to_nominal:
                        fit_hist_up.Divide(nominal_fit_hists[plot_category])
                        fit_hist_down.Divide(nominal_fit_hists[plot_category])
                        hist_up.Divide(nominal_hists[plot_category])
                        hist_down.Divide(nominal_hists[plot_category])

                    plot.draw({config_id + ", up": fit_hist_up}, line_color=None)
                    plot.draw({config_id + ", down": fit_hist_down}, line_color=None)
                    plot.draw({config_id + ", up": hist_up}, line_color=2, options=["hist"])
                    plot.draw({config_id + ", down": hist_down}, line_color=4, options=["hist"])

        # save plots
        for plot_category in plots:
            plot = plots[plot_category]
            plot_name = self.get_plot_name(plot_category, self.shifts_identifier, self.b_taggers[0],
                self.iterations[0])
            # list() is needed as dict views cannot be indexed in Python 3
            lumi = list(self.config_inst.get_aux("lumi").values())[0] / 1000.
            plot.save(os.path.join(local_tmp.path, plot_name), draw_legend=True, lumi=lumi)
            del plot

        # pack all saved plots into the output archive
        with outp.localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file), arcname=plot_file)
Esempio n. 10
0
    def env(self):
        """
        Return the environment variables as seen inside the singularity image.

        The environment is determined once per image and cached in ``self._envs``. To obtain it, a
        python process inside the container pickles ``os.environ`` into a file that is read back on
        the host. An *Exception* is raised when the command fails.
        """
        # serve cached environments right away
        if self.image in self._envs:
            return self._envs[self.image]

        # strategy: unlike docker, singularity might not allow binding of paths that do not exist
        # in the container, so create a tmp directory on the host system and bind it as /tmp, let
        # python dump its full env into a file, and read the file again on the host system
        host_dir = LocalDirectoryTarget(is_tmp=True)
        host_dir.touch()

        env_target = host_dir.child("env", type="f")
        env_target.touch()

        # volume binding is controlled by a task-level callback if present, else by the config
        allow_binds_cb = getattr(self.task, "singularity_allow_binds", None)
        if callable(allow_binds_cb):
            allow_binds = allow_binds_cb()
        else:
            allow_binds = Config.instance().get_expanded(self.get_config_section(), "allow_binds")

        # arguments to configure the environment and the location of the env file as seen from
        # within the container
        if allow_binds:
            args = ["-e", "-B", "{}:/tmp".format(host_dir.path)]
            env_file = "/tmp/{}".format(env_target.basename)
        else:
            args = ["-e"]
            env_file = env_target.path

        # the singularity exec command and the commands that set up the environment
        singularity_exec_cmd = self._singularity_exec_cmd() + args
        setup_cmds = self._build_setup_cmds(self._get_env())

        # python command that dumps the environment into the env file
        py_cmd = "import os,pickle;pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(
            env_file)

        # the full command runs setup and dump within a single login shell
        bash_cmd = "; ".join(flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd])))
        cmd = quote_cmd(singularity_exec_cmd + [self.image, "bash", "-l", "-c", bash_cmd])

        # run it, merging stderr into stdout for the error message
        popen_kwargs = dict(
            shell=True,
            executable="/bin/bash",
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        code, out, _ = interruptable_popen(cmd, **popen_kwargs)
        if code != 0:
            raise Exception("singularity sandbox env loading failed:\n{}".format(out))

        # load the environment from the tmp file and cache it
        self._envs[self.image] = env_target.load(formatter="pickle")

        return self._envs[self.image]
Esempio n. 11
0
    def run(self):
        """
        Plot the scale factors stored in a b-tag calibration csv file for every pt / eta bin.

        For each csv file (the main one and, if given, *compare_file*), the nominal scale factors
        are evaluated over the discriminator range and drawn together with the up / down
        variations of the requested shift. For ``shift == "ALL"``, the variations of all shifts are
        combined in quadrature. When no *compare_file* is set, the function stored in the
        corresponding .root file is drawn as well. One pdf per pt / eta bin is saved into a
        temporary directory whose content is written into the (localized) output as a gzipped tar
        archive.
        """
        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        ROOT.gSystem.Load('libCondFormatsBTauObjects')
        ROOT.gSystem.Load('libCondToolsBTau')

        # plots are written here first and packed into the output archive at the end
        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        # determine the list of shifts to evaluate
        jes_sources = self.config_inst.get_aux("jes_sources")
        shifts = []
        if self.shift == "ALL":
            if self.flavor == "c":
                shifts.extend(["cferr1", "cferr2"])
            else:
                shifts.extend([
                    "jes{}".format(jes_source) for jes_source in jes_sources
                    if jes_source != "Total"
                ])
                shifts.extend([
                    "{}{}".format(region, suffix) for region, suffix in
                    itertools.product(["lf", "hf"], ["", "stats1", "stats2"])
                ])
        elif self.shift == "NONE":
            shifts = []
        else:
            shifts = [self.shift]

        # systematic types the calibration reader has to know about
        v_sys = getattr(ROOT, 'vector<string>')()
        for shift in shifts:
            v_sys.push_back("up_" + shift)
            v_sys.push_back("down_" + shift)

        flavor_ids = self.config_inst.get_aux("flavor_ids")
        binning = self.config_inst.get_aux("binning")[self.flavor]

        # (start, end) tuples of neighboring bin edges
        pt_binning = list(zip(binning["pt"][:-1], binning["pt"][1:]))
        eta_binning = list(zip(binning["abs(eta)"][:-1], binning["abs(eta)"][1:]))

        # (figure, axes) pairs, keyed by the (pt_start, pt_end, eta_start, eta_end) tuple
        figures = {}
        if self.compare_file is None:
            csv_files, descriptions = [self.csv_file], ["csv"]
        else:
            csv_files, descriptions = [self.csv_file,
                                       self.compare_file], ["new", "old"]
        for input_file, desc in zip(csv_files, descriptions):
            # create calibration reader
            calib = ROOT.BTagCalibration("csv_{}".format(desc), input_file)
            reader = ROOT.BTagCalibrationReader(
                3,  # 0 is for loose op, 1: medium, 2: tight, 3: discr. reshaping
                "central",  # central systematic type
                v_sys,  # vector of other sys. types
            )
            for jetFlavor in [0, 1, 2]:
                reader.load(
                    calib,
                    jetFlavor,  # 0 is for b flavour, 1: FLAV_C, 2: FLAV_UDSG
                    "iterativefit"  # measurement type
                )

            for pt_idx, pt_range in enumerate(pt_binning):
                for eta_idx, eta_range in enumerate(eta_binning):
                    # reuse the figure of this bin when it was created for a previous csv file
                    key = pt_range + eta_range
                    if key not in figures:
                        fig = plt.figure()
                        ax = fig.add_subplot(111)
                        ax.set_title("pt: %s to %s, eta: %.1f to %.1f" %
                                     (pt_range + eta_range))
                    else:
                        fig, ax = figures[key]

                    # representative point within the bin, open-ended pt bins have no mean
                    if pt_range[1] == np.inf:
                        pt_val = pt_range[0] + 1
                    else:
                        pt_val = np.mean(pt_range)
                    eta_val = np.mean(eta_range)

                    def get_values(csv_reader, sys_type):
                        # evaluate the scale factor of *sys_type* on a fixed discriminator grid
                        x_values = np.linspace(-0.1, 1., 10000)
                        y_values = []
                        for csv_value in x_values:
                            sf = csv_reader.eval_auto_bounds(
                                sys_type,  # systematic (here also 'up'/'down' possible)
                                flavor_ids[self.flavor],  # jet flavor
                                eta_val,  # absolute value of eta
                                pt_val,  # pt
                                csv_value)
                            y_values.append(sf)
                        return np.array(x_values), np.array(y_values)

                    x_values, nominal_values = get_values(reader, "central")
                    if not self.norm_to_nominal:
                        ax.plot(x_values,
                                nominal_values,
                                label="{}, {}".format(desc, "nominal"))

                    if self.shift != "NONE":
                        total_errors_up = np.zeros(nominal_values.shape)
                        total_errors_down = np.zeros(nominal_values.shape)
                        for shift in shifts:
                            _, up_values = get_values(reader, "up_" + shift)
                            _, down_values = get_values(
                                reader, "down_" + shift)

                            if len(shifts) > 1:  # build envelope
                                diff_up = up_values - nominal_values
                                diff_down = down_values - nominal_values

                                # shift with effect in up/down direction
                                zeros = np.zeros(nominal_values.shape)
                                errors_up = np.max([diff_up, diff_down, zeros], axis=0)
                                errors_down = np.min([diff_up, diff_down, zeros], axis=0)

                                # add in quadrature
                                total_errors_up += errors_up**2
                                total_errors_down += errors_down**2
                        total_errors_up = total_errors_up**0.5
                        total_errors_down = total_errors_down**0.5

                        # for a single shift, the values of that shift are drawn directly
                        if len(shifts) > 1:
                            up_values = nominal_values + total_errors_up
                            down_values = nominal_values - total_errors_down
                        if self.norm_to_nominal:
                            up_values /= nominal_values
                            down_values /= nominal_values
                        ax.plot(x_values,
                                up_values,
                                label="{}, {}".format(desc, "up_" + self.shift))
                        # the label uses the same "<direction>_<shift>" scheme as the up variation
                        ax.plot(x_values,
                                down_values,
                                label="{}, {}".format(desc, "down_" + self.shift))

                    if self.compare_file is None:
                        # without a file to compare to, draw the function stored in the .root file
                        if self.flavor in ["c", "hf"]:
                            root_path = self.root_hf_file
                        elif self.flavor == "lf":
                            root_path = self.root_lf_file
                        else:
                            # NOTE(review): reached for flavors other than c, hf and lf, so the
                            # message wording looks outdated; kept as callers may rely on it
                            raise Exception("No .root file for c flavor SFs.")

                        root_file = ROOT.TFile.Open(root_path)

                        func_name = "csv_ratio_Pt{}_Eta{}_final".format(
                            pt_idx, eta_idx)
                        if self.flavor == "c":
                            func_name = "c_" + func_name

                        func = root_file.Get(func_name)

                        y_values = []
                        x_values = np.linspace(-0.1, 1., 10000)
                        for csv_value in x_values:
                            y_val = func.Eval(csv_value)
                            y_values.append(y_val)

                        ax.plot(x_values,
                                y_values,
                                label="{}, {}".format(".root", "nominal"))

                    figures[key] = (fig, ax)

            del reader
            del calib

        # finalize and save one pdf per pt / eta bin
        for key, (fig, ax) in figures.items():
            ax.legend(loc="lower right")
            ax.set_ylim(0., 2.)
            fig.savefig(
                os.path.join(
                    local_tmp.path, "SF_%s_%s_Pt%sTo%s_eta%.1fTo%.1f.pdf" %
                    ((self.flavor, self.shift) + key)))

        # pack all saved plots into the output archive
        with self.output().localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file),
                            arcname=plot_file)
Esempio n. 12
0
    def run(self):
        """
        Plot the nominal scale factor histograms of a .root file with their up / down variations.

        The input file is chosen by *flavor* (``hf_file`` for ``"hf"``, ``lf_file`` otherwise). For
        each pt / eta bin defined by the config binning, the nominal histogram is drawn together
        with an up and a down histogram. These are either the variations of the single requested
        shift, or, for multiple shifts, an envelope built by adding the per-bin deviations of all
        shifts in quadrature. The plots are saved as pdf files into a temporary directory whose
        content is written into the (localized) output as a gzipped tar archive.
        """
        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        # plots are written here first and packed into the output archive at the end
        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        if self.flavor == "hf":
            input_file = self.hf_file
        else:
            input_file = self.lf_file

        root_file = ROOT.TFile.Open(input_file)

        hist_tpl = "h_csv_ratio_Pt{}_Eta{}_final"

        # one histogram name per combination of pt and eta bin index
        binning = self.config_inst.get_aux("binning")[self.flavor]
        n_pt_categories = len(binning["pt"]) - 1
        n_eta_categories = len(binning["abs(eta)"]) - 1
        hist_names = [
            hist_tpl.format(pt_idx, eta_idx)
            for pt_idx in range(n_pt_categories)
            for eta_idx in range(n_eta_categories)
        ]

        ##
        ## Plot histograms with total systematic envelope
        ##

        for hist_name in hist_names:
            nominal_hist = root_file.Get(hist_name)

            # per-bin errors, index i refers to histogram bin i + 1
            errors_up = []
            errors_down = []
            # collect all shifts from file: take the last "_"-separated part of every key that
            # starts with the histogram name and keep those ending with "Up", without that suffix
            if self.shift == "ALL":
                shifts = [
                    key.GetName().split("_")[-1]
                    for key in root_file.GetListOfKeys()
                    if key.GetName().startswith(hist_name)
                ]
                shifts = set(
                    [shift[:-2] for shift in shifts if shift[-2:] == "Up"])
            else:
                shifts = [self.shift]
            for shift_idx, shift in enumerate(shifts):
                hist_name_up = hist_name + "_" + shift + "Up"
                hist_name_down = hist_name + "_" + shift + "Down"
                hist_up = root_file.Get(hist_name_up)
                hist_down = root_file.Get(hist_name_down)

                for bin_idx in range(1, nominal_hist.GetNbinsX() + 1):
                    nominal_value = nominal_hist.GetBinContent(bin_idx)

                    # combine all shifts that have an effect in the same direction
                    # effect from <shift>_up/down systematics
                    diff_up = hist_up.GetBinContent(bin_idx) - nominal_value
                    diff_down = hist_down.GetBinContent(
                        bin_idx) - nominal_value

                    # detect systematics where up/down shift direction is the same
                    #if diff_up * diff_down > 0:
                    #    print "One sided shift: {}, {}".format(shift, category)

                    # if multiple shifts, build envelope
                    if len(shifts) != 1:
                        # shift with effect in up/down direction
                        error_up = max([diff_up, diff_down, 0])
                        error_down = min([diff_up, diff_down, 0])

                        # add in quadrature, the first shift creates the per-bin entries and all
                        # following shifts add to them
                        if shift_idx == 0:
                            errors_up.append(error_up**2)
                            errors_down.append(error_down**2)
                        else:
                            errors_up[bin_idx - 1] += error_up**2
                            errors_down[bin_idx - 1] += error_down**2
                    else:
                        errors_up.append(diff_up)
                        errors_down.append(-diff_down)  # is subtracted later
            # multiple shifts have been added quadratically, take square root
            if len(shifts) != 1:
                errors_up = np.sqrt(errors_up)
                errors_down = np.sqrt(errors_down)

            # build shifted histograms
            combined_hist_up = nominal_hist.Clone()
            combined_hist_down = nominal_hist.Clone()

            # shift the nominal bin contents up / down by the combined errors
            for bin_idx in range(1, nominal_hist.GetNbinsX() + 1):
                combined_hist_up.SetBinContent(
                    bin_idx,
                    combined_hist_up.GetBinContent(bin_idx) +
                    errors_up[bin_idx - 1])
                combined_hist_down.SetBinContent(
                    bin_idx,
                    combined_hist_down.GetBinContent(bin_idx) -
                    errors_down[bin_idx - 1])

            if self.norm_to_nominal:
                combined_hist_up.Divide(nominal_hist)
                combined_hist_down.Divide(nominal_hist)

            plot = ROOTPlot(hist_name, hist_name)
            plot.create_pads()
            plot.cd(0, 0)
            plot.draw({"nominal": nominal_hist}, line_color=1)
            plot.draw({"up": combined_hist_up}, line_color=2)
            plot.draw({"down": combined_hist_down}, line_color=4)

            plot.save(os.path.join(local_tmp.path, "{}.pdf".format(hist_name)))

        ##
        ## Check scale factors, uncertainties, and fits
        ##

        # NOTE(review): this loop only fetches the data, signal and background histograms and does
        # not use them afterwards, it looks unfinished
        for i, (pt_idx, eta_idx) in enumerate(
                itertools.product(range(n_pt_categories),
                                  range(n_eta_categories))):
            data_hist = root_file.Get("h_csv_Data_Pt{}_Eta{}".format(
                pt_idx, eta_idx))
            if self.flavor == "hf":
                signal_base = "h_csv_MC_bjets"
                bg_base = "h_csv_MC_nonbjets"
            else:
                signal_base = "h_csv_MC_nonbjets"  # actually lf
                bg_base = "h_csv_MC_bjets"  # actually b + c

            signal_hist = root_file.Get("{}_Pt{}_Eta{}".format(
                signal_base, pt_idx, eta_idx))
            bg_hist = root_file.Get("{}_Pt{}_Eta{}".format(
                bg_base, pt_idx, eta_idx))

            #

        # pack all saved plots into the output archive
        with self.output().localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file),
                            arcname=plot_file)
Esempio n. 13
0
def merge_parquet_task(task,
                       inputs,
                       output,
                       local=False,
                       cwd=None,
                       force=True,
                       writer_opts=None):
    """
    This method is intended to be used by tasks that are supposed to merge parquet files, e.g. when
    inheriting from :py:class:`law.contrib.tasks.MergeCascade`. *inputs* should be a sequence of
    targets that represent the files to merge into *output*. Strings in *inputs* as well as a string
    *output* are interpreted as local paths and wrapped into local file targets.

    When *local* is *True*, inputs and output are merged directly based on their paths. When *local*
    is *False*, the inputs are first copied into *cwd*, which can be a directory target or a local
    path that is used as the download directory. When empty, a temporary directory is used. The
    merged file is then written into a localized representation of *output*.

    The *task* itself is used to print and publish messages via its
    :py:meth:`law.Task.publish_message` and :py:meth:`law.Task.publish_step` methods. When *force*
    is *True*, any existing output file is removed before merging. When there is only a single
    input, it is copied to the output instead of being merged. *writer_opts* is forwarded to
    :py:func:`merge_parquet_files` which is used internally for the actual merging.
    """
    # ensure inputs are targets
    inputs = [
        LocalFileTarget(inp) if isinstance(inp, six.string_types) else inp
        for inp in inputs
    ]

    # ensure output is a target
    if isinstance(output, six.string_types):
        output = LocalFileTarget(output)

    def merge(inputs, output):
        # merge (or copy) the given local inputs into output and report the resulting size
        with task.publish_step("merging {} parquet files ...".format(
                len(inputs)),
                               runtime=True):
            # clear the output if necessary
            if output.exists() and force:
                output.remove()

            if len(inputs) == 1:
                # nothing to merge, a plain copy is sufficient
                output.copy_from_local(inputs[0])
            else:
                # merge
                merge_parquet_files([inp.path for inp in inputs],
                                    output.path,
                                    writer_opts=writer_opts)

        # print the size
        output_size = human_bytes(output.stat().st_size, fmt=True)
        task.publish_message(f"merged file size: {output_size}")

    if local:
        # everything is local, just merge
        merge(inputs, output)

    else:
        # when not local, we need to fetch files first into the cwd
        if not cwd:
            cwd = LocalDirectoryTarget(is_tmp=True)
        elif isinstance(cwd, six.string_types):
            # same string check as used for inputs and output above
            cwd = LocalDirectoryTarget(cwd)
        cwd.touch()

        # fetch
        with task.publish_step("fetching inputs ...", runtime=True):

            def fetch(inp):
                # copy a single input into the cwd and return the local copy
                local_inp = cwd.child(inp.unique_basename, type="f")
                inp.copy_to_local(local_inp, cache=False)
                return local_inp

            def callback(i):
                task.publish_message("fetch file {} / {}".format(
                    i + 1, len(inputs)))

            local_inputs = map_verbose(fetch,
                                       inputs,
                                       every=5,
                                       callback=callback)

        # merge into a localized output
        with output.localize("w", cache=False) as local_output:
            merge(local_inputs, local_output)
Esempio n. 14
0
File: util.py Progetto: meliache/law
def hadd_task(task, inputs, output, cwd=None, local=False, force=True):
    """
    This method is intended to be used by tasks that are supposed to merge root files, e.g. when
    inheriting from :py:class:`law.contrib.tasks.MergeCascade`. *inputs* should be a sequence of
    targets that represent the files to merge into *output*. Strings in *inputs* as well as a string
    *output* are interpreted as local paths and wrapped into local file targets. *cwd* is the
    working directory in which hadd is invoked and can be a directory target or a local path. When
    empty, a temporary directory is used. The *task* itself is used to print and publish messages
    via its :py:meth:`law.Task.publish_message` and :py:meth:`law.Task.publish_step` methods.

    When *local* is *True*, the input and output targets are assumed to be local and the merging is
    based on their local paths. Otherwise, the targets are fetched first and the output target is
    localized. When there is only a single input, hadd is not invoked and the file is used as the
    output directly.

    When *force* is *True*, any existing output file is overwritten (by adding the ``-f`` flag to
    ``hadd``).

    An *Exception* is raised when ``hadd`` exits with a non-zero code.
    """
    # ensure inputs are targets
    inputs = [
        LocalFileTarget(inp) if isinstance(inp, six.string_types) else inp
        for inp in inputs
    ]

    # ensure output is a target
    if isinstance(output, six.string_types):
        output = LocalFileTarget(output)

    # default cwd
    if not cwd:
        cwd = LocalDirectoryTarget(is_tmp=True)
    elif isinstance(cwd, six.string_types):
        cwd = LocalDirectoryTarget(cwd)
    cwd.touch()

    # helper to create the hadd cmd
    def hadd_cmd(input_paths, output_path):
        cmd = ["hadd", "-n", "0"]
        if force:
            cmd.append("-f")
        cmd.extend(["-d", cwd.path])
        cmd.append(output_path)
        cmd.extend(input_paths)
        return quote_cmd(cmd)

    if local:
        # when local, there is no need to download inputs
        input_paths = [inp.path for inp in inputs]

        with task.publish_step("merging ...", runtime=True):
            if len(inputs) == 1:
                output.copy_from_local(inputs[0])
            else:
                # merge using hadd
                cmd = hadd_cmd(input_paths, output.path)
                code = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
                if code != 0:
                    raise Exception("hadd failed")

        task.publish_message("merged file size: {}".format(human_bytes(
            output.stat.st_size, fmt=True)))

    else:
        # when not local, we need to fetch files first into the cwd
        with task.publish_step("fetching inputs ...", runtime=True):
            def fetch(inp):
                # copy the input into the cwd and return its basename, which is a valid relative
                # path for hadd since the command is executed inside the cwd
                inp.copy_to_local(cwd.child(inp.unique_basename, type="f"), cache=False)
                return inp.unique_basename

            def callback(i):
                task.publish_message("fetch file {} / {}".format(i + 1, len(inputs)))

            bases = map_verbose(fetch, inputs, every=5, callback=callback)

        # start merging into the localized output
        with output.localize("w", cache=False) as tmp_out:
            with task.publish_step("merging ...", runtime=True):
                if len(bases) == 1:
                    # nothing to merge, point the localized output to the fetched file
                    tmp_out.path = cwd.child(bases[0]).path
                else:
                    # merge using hadd
                    cmd = hadd_cmd(bases, tmp_out.path)
                    code = interruptable_popen(cmd, shell=True, executable="/bin/bash",
                        cwd=cwd.path)[0]
                    if code != 0:
                        raise Exception("hadd failed")

            # report the size for both the single-file and the hadd case, just as the local
            # branch does
            task.publish_message("merged file size: {}".format(human_bytes(
                tmp_out.stat.st_size, fmt=True)))
Esempio n. 15
0
 def htcondor_output_directory(self):
     """
     Return the local directory target, located at ``self.local_path()``, in which submission meta
     data should be stored.
     """
     submission_dir = self.local_path()
     return LocalDirectoryTarget(submission_dir)
Esempio n. 16
0
    def run(self):
        """
        Draw data/MC comparison plots for all categories that carry both the configured category
        tag and the b-tagger tag. Each plot consists of two pads: the data and (summed or stacked)
        MC histograms, and the data/MC ratio. When ``self.draw_systematics`` is set, an envelope
        built from the up/down shifted MC histograms is drawn in both pads. All plots are saved to
        a temporary local directory and finally packed into a gzipped tar archive that is written
        to the task output.
        """
        def add_hist(hist, new_hist, sign=1.):
            # start a new sum from a scaled clone, or add to the existing sum with the given sign
            if hist is None:
                hist = new_hist.Clone()
                hist.Scale(sign)
            else:
                hist.Add(new_hist, sign)
            return hist

        import ROOT

        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        inp = self.input()
        outp = self.output()

        # per-channel scales are only needed (and only loaded) when normalizing
        if self.normalize:
            scales = inp["scale"]["channel_scales"].load()

        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        # select categories that have both the category tag and the b-tagger tag
        categories = []
        for category, _, _ in self.config_inst.walk_categories():
            if category.has_tag((self.category_tag, self.b_tagger), mode=all):
                categories.append(category)

        # create plot objects
        plot_dict = {}
        for category in categories:
            plot = ROOTPlot(category.name, category.name)
            plot.create_pads(n_pads_y=2, limits_y=[0., 0.3, 1.0], legend_loc="upper")
            plot_dict[category] = plot

        with inp["hists"].load("r") as input_file:
            for category in categories:
                data_hist = None
                mc_hists = defaultdict(lambda: defaultdict(lambda: None)) # shift -> key (process/flavor)

                for leaf_cat, _, children in category.walk_categories():
                    # we are only interested in leaves
                    if children:
                        continue

                    flavor = leaf_cat.get_aux("flavor", None)
                    channel = leaf_cat.get_aux("channel")
                    region = leaf_cat.get_aux("region", None)

                    category_dir = input_file.GetDirectory(leaf_cat.name)
                    for process_key in category_dir.GetListOfKeys():
                        process = self.config_inst.get_process(process_key.GetName())
                        process_dir = category_dir.GetDirectory(process.name)

                        # avoid double counting of inclusive and flavor-dependent histograms
                        if flavor is not None:  # Not needed in case region isn't flavor specific
                            if process.is_data and flavor != "inclusive":
                                continue
                            elif process.is_mc and flavor == "inclusive":
                                continue

                        for shift in self.shifts:
                            # data is only considered for the nominal shift
                            if process.is_data and shift != "nominal":
                                continue
                            for variable in self.variable:
                                # create variable name from template
                                aux = leaf_cat.aux.copy()
                                aux["b_tag_var"] = self.config_inst.get_aux("btaggers")[self.b_tagger]["variable"]
                                aux["b_tagger"] = self.b_tagger
                                aux["shift"] = shift
                                variable = variable.format(**aux)

                                hist = process_dir.Get(variable)

                                binning_type = "measurement" if self.rebin else None
                                hist = self.rebin_hist(hist, region, binning_type=binning_type, truncate=self.truncate)

                                add_to_data, sign = self.associate_hist(process=process, flavor=flavor, region=region)
                                if add_to_data:
                                    if shift != "nominal":
                                        raise Exception("Cannot add shifted samples to data.")
                                    data_hist = add_hist(data_hist, hist, sign=sign)
                                else:
                                    if self.normalize and region is not None:  # apply "trigger" sfs as part of the normalization
                                        hist.Scale(scales[channel.name][region])

                                    key = process if self.mc_split == "process" else flavor
                                    mc_hists[shift][key] = add_hist(mc_hists[shift][key], hist, sign=sign)

                if self.normalize:  # normalize mc yield to data in this category
                    mc_yield = sum(hist.Integral() for hist in mc_hists["nominal"].values())
                    data_yield = data_hist.Integral()
                    norm_factor = data_yield / mc_yield
                    for shift in self.shifts:
                        for mc_hist in mc_hists[shift].values():
                            mc_hist.Scale(norm_factor)

                # get maximum value of hists/ stacks drawn to set axis ranges
                # (dict views are not indexable in python 3, so materialize them as a list first)
                nominal_mc_hists = list(mc_hists["nominal"].values())
                mc_hist_sum = nominal_mc_hists[0].Clone()

                for mc_hist in nominal_mc_hists[1:]:
                    mc_hist_sum.Add(mc_hist)
                hist_maximum = max([mc_hist_sum.GetMaximum(), data_hist.GetMaximum()])

                # get plot names
                mc_key = self.mc_key.format(**{"region": category.get_aux("region", None)})
                data_key = self.data_key.format(**{"region": category.get_aux("region", None)})

                plot = plot_dict[category]
                # data and mc histograms
                plot.cd(0, 1)
                if self.draw_stacked:
                    plot.draw(mc_hists["nominal"], stacked=True, stack_maximum=1.5*hist_maximum, y_title="Entries")
                else:
                    # fix axis range
                    invis_hist = mc_hist_sum.Clone() if mc_hist_sum.GetMaximum() > data_hist.GetMaximum() else data_hist.Clone()
                    invis_hist.Scale(1.5)
                    plot.draw({"invis": invis_hist}, invis=True)
                    plot.draw({mc_key: mc_hist_sum}, line_color=None)
                plot.draw({data_key: data_hist})

                if self.draw_systematics:
                    up_shifted_mc_hists = {}
                    down_shifted_mc_hists = {}
                    for shift in self.shifts:
                        # combine processes/ flavors
                        shifted_mc_hists = list(mc_hists[shift].values())
                        shifted_mc_hist_sum = shifted_mc_hists[0].Clone()
                        for mc_hist in shifted_mc_hists[1:]:
                            shifted_mc_hist_sum.Add(mc_hist)

                        # store the sums under the shift name without its direction suffix
                        if shift.endswith("_down"):
                            down_shifted_mc_hists[shift[:-5]] = shifted_mc_hist_sum.Clone()
                        elif shift.endswith("_up"):
                            up_shifted_mc_hists[shift[:-3]] = shifted_mc_hist_sum.Clone()

                    envelope = build_hist_envelope(mc_hist_sum, up_shifted_mc_hists,
                        down_shifted_mc_hists, envelope_as_errors=True)

                    plot.draw_as_graph(envelope, options="2", hatched=True)

                # add category information to plot
                pt_range = category.get_aux("pt", None)
                eta_range = category.get_aux("eta", None)
                if pt_range is not None and eta_range is not None:
                    if not np.isinf(pt_range[1]):
                        text = r"#splitline{%d < p_{T} < %d}{%.1f < |#eta| < %.1f}" % \
                            (pt_range[0], pt_range[1], eta_range[0], eta_range[1])
                    else:
                        text = r"#splitline{p_{T} > %d}{%.1f < |#eta| < %.1f}" % \
                            (pt_range[0], eta_range[0], eta_range[1])
                    plot.draw_text(text, size=0.05, xpos=0.505, ypos=0.5, align=11)

                # ratio of data to mc below the main plot
                plot.cd(0, 0)

                # ratio histograms
                # mc error band
                ratio_mcerr_hist = mc_hist_sum.Clone()
                # divide without error propagation
                self.divide_hists(ratio_mcerr_hist, mc_hist_sum)

                # ratio
                ratio_hist = data_hist.Clone()
                self.divide_hists(ratio_hist, mc_hist_sum)

                y_axis = ratio_hist.GetYaxis()
                y_axis.SetRangeUser(0.5, 1.5)
                y_axis.SetTitle("data/MC")
                y_axis.SetTitleSize(y_axis.GetTitleSize() * plot.open_pad.scale_factor)
                y_axis.SetLabelSize(y_axis.GetLabelSize() * plot.open_pad.scale_factor)
                y_axis.SetNdivisions(505)
                y_axis.SetTitleOffset(0.65)

                x_axis = ratio_hist.GetXaxis()
                if self.x_title:
                    aux = category.aux.copy()
                    aux["b_tag_var"] = self.config_inst.get_aux("btaggers")[self.b_tagger]["label"]
                    x_axis.SetTitle(self.x_title.format(**aux))

                x_axis.SetTitleSize(x_axis.GetTitleSize() * plot.open_pad.scale_factor)
                x_axis.SetLabelSize(x_axis.GetLabelSize() * plot.open_pad.scale_factor)

                plot.draw({"invis": ratio_hist}, invis=True)
                plot.draw_as_graph(ratio_mcerr_hist, options="2")
                plot.draw({"data/mc": ratio_hist})

                if self.draw_systematics:
                    # build envelope of ratio to nominal hist
                    for hist in up_shifted_mc_hists.values():
                        hist.Divide(mc_hist_sum)
                    for hist in down_shifted_mc_hists.values():
                        hist.Divide(mc_hist_sum)
                    scaled_envelope = build_hist_envelope(ratio_mcerr_hist, up_shifted_mc_hists,
                        down_shifted_mc_hists, envelope_as_errors=True)
                    plot.draw_as_graph(scaled_envelope, options="2", hatched=True)

        # save all plots into the temporary directory
        for category, plot in plot_dict.items():
            plot_name = self.get_plot_name(category.name, self.variable, self.b_tagger, self.iteration)
            plot.save(os.path.join(local_tmp.path, plot_name),
                draw_legend=(False, True), log_y=self.logarithmic,
                lumi=list(self.config_inst.get_aux("lumi").values())[0]/1000.)
            del plot

        # bundle the saved plots into a single gzipped tar archive at the output location
        with outp.localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file), arcname=plot_file)
Esempio n. 17
0
 def output(self):
     """Return the output of this task: a local directory target located at ``self.path``."""
     target_path = self.path
     return LocalDirectoryTarget(target_path)