Beispiel #1
0
# Fit-variable overrides used for ``args.fixed_variables == "best_choice"``.
# Layout: channel -> category suffix (the category name without its
# "<channel>_" prefix) -> (quantity, omega_version).
# A quantity of None means "use the category's own test expression"; an
# omega_version of None means "do not substitute the omega variant".
_BEST_CHOICE_SEMILEPTONIC = {
    "combined_a1_oneprong": ("m_vis", None),
    "combined_rho_oneprong": (None, None),
    "combined_oneprong_oneprong": (None, "BarSvfitM91"),
    "a1": (None, "BarSvfitM91"),
    "oneprong": (None, "BarSvfitM91"),
    "rho": (None, "VisibleSvfit"),
}
_BEST_CHOICE_FIT_VARIABLES = {
    "tt": {
        "combined_rho_oneprong": ("m_vis", None),
        "combined_oneprong_oneprong": ("m_vis", None),
        "combined_a1_rho": (None, None),
        "combined_a1_a1": (None, "BarSvfitM91"),
        "combined_a1_oneprong": (None, "BarSvfitM91"),
        "combined_rho_rho": (None, "VisibleSvfit"),
        "rho": (None, "VisibleSvfit"),
    },
    "mt": _BEST_CHOICE_SEMILEPTONIC,
    "et": _BEST_CHOICE_SEMILEPTONIC,
    "em": {
        "combined_oneprong_oneprong": ("m_vis", None),
    },
}


def _select_fit_variables(fixed_variables, channel, category, quantity,
                          omega_version):
    """Return the ``(quantity, omega_version)`` pair to use for one category.

    ``quantity`` and ``omega_version`` are the user-supplied defaults; they
    are returned unchanged unless ``fixed_variables`` selects an override for
    this channel/category combination.
    """
    prefix = channel + "_"
    # Overrides only ever apply to categories named "<channel>_<suffix>".
    suffix = category[len(prefix):] if category.startswith(prefix) else None

    if fixed_variables == "best_choice":
        per_channel = _BEST_CHOICE_FIT_VARIABLES.get(channel, {})
        return per_channel.get(suffix, (quantity, omega_version))

    if fixed_variables == "best_choice_no_svfit":
        if channel in ("tt", "mt", "et") and suffix in ("combined_rho_rho",
                                                        "rho"):
            return None, "VisibleSvfit"
        return "m_vis", None

    return quantity, omega_version


def _is_gen_polarisation_nick(nick):
    """Return True if ``nick`` names a gen-level zttpospol/zttnegpol histogram."""
    return ("gen_zttpospol" in nick) or ("gen_zttnegpol" in nick)


def create_input_root_files(datacards, args):
    """Configure HarryPlotter for the datacard samples and create the input ROOT files.

    For every (channel, category) pair and every shape systematic found in
    ``datacards.cb`` one plot config is built that exports the histograms to a
    temporary ROOT file. After plotting, the temporary files of each category
    are merged with ``hadd`` into one file per category and removed.

    Attributes read from ``args``: channel, categories, output_dir,
    use_asimov_dataset, quantity, omega_version, fixed_variables, weight,
    lumi, no_ewk_samples, fixed_binning, input_dir, n_plots, args,
    n_processes, batch, www.

    Side effects: overwrites ``datacards.configs._mapping_process2sample``,
    deletes stale temporary ROOT files, runs the plotter and the ``hadd``
    shell commands.

    Returns None.
    """
    plot_configs = []
    output_files = []
    hadd_commands = []

    sample_settings = samples.Samples()
    expression_settings = expressions.ExpressionsDict()
    binnings_settings = binnings.BinningsDict()
    systematics_factory = systematics.SystematicsFactory()

    datacards.configs._mapping_process2sample = {
        "data_obs": "data",
        "EWKZ": "ewkz",
        "QCD": "qcd",
        "TT": "ttj",
        "TTT": "ttt",
        "TTJ": "ttjj",
        "VV": "vv",
        "VVT": "vvt",
        "VVJ": "vvj",
        "W": "wj",
        "ZJ": "zj",
        "ZL": "zl",
        "ZLL": "zll",
        "ZTTPOSPOL": "zttpospol",
        "ZTTNEGPOL": "zttnegpol",
        "ZTT_GEN_DM_ZERO": "ztt_gen_dm_zero",
        "ZTT_GEN_DM_ONE": "ztt_gen_dm_one",
        "ZTT_GEN_DM_TWO": "ztt_gen_dm_two",
        "ZTT_GEN_DM_TEN": "ztt_gen_dm_ten",
        "ZTT_GEN_DM_ELEVEN": "ztt_gen_dm_eleven",
    }

    for channel, categories in zip(args.channel, args.categories):
        for category in categories:
            datacards_per_channel_category = datacardsbase.Datacards(
                cb=datacards.cb.cp().channel([channel]).bin([category]))

            # "*" is the wildcard mass entry, not an actual mass point.
            higgs_masses = [
                mass for mass in datacards_per_channel_category.cb.mass_set()
                if mass != "*"
            ]

            output_file = os.path.join(
                args.output_dir,
                "input/{ANALYSIS}_{CHANNEL}_{BIN}_{ERA}.root".format(
                    ANALYSIS="ztt", CHANNEL=channel, BIN=category,
                    ERA="13TeV"))
            output_files.append(output_file)
            tmp_output_files = []

            # The fit variable depends only on channel and category, so it is
            # resolved once here instead of once per systematic shift.
            tmp_quantity, tmp_omega_version = _select_fit_variables(
                args.fixed_variables, channel, category, args.quantity,
                args.omega_version)

            samples_per_systematic = (
                datacards_per_channel_category
                .get_samples_per_shape_systematic())
            for shape_systematic, processes in samples_per_systematic.iteritems():
                nominal = (shape_systematic == "nominal")
                list_of_samples = [
                    datacards.configs.process2sample(process)
                    for process in processes
                ]
                # W+jets and QCD are always requested together: if only one
                # of them is present, the other one is added.
                if "wj" in list_of_samples and "qcd" not in list_of_samples:
                    list_of_samples.append("qcd")
                elif "qcd" in list_of_samples and "wj" not in list_of_samples:
                    list_of_samples.append("wj")

                asimov_nicks = []
                if args.use_asimov_dataset:
                    asimov_nicks = [
                        nick.replace("zttpospol", "zttpospol_noplot").replace(
                            "zttnegpol", "zttnegpol_noplot")
                        for nick in list_of_samples
                    ]
                    if "data" in asimov_nicks:
                        asimov_nicks.remove("data")

                # The nominal shape has a single variation; every real
                # systematic gets an Up and a Down shift.
                for shift_up in ([True] if nominal else [True, False]):
                    if nominal:
                        systematic = "nominal"
                        shift = 0.0
                    else:
                        systematic = shape_systematic + (
                            "Up" if shift_up else "Down")
                        shift = 1.0 if shift_up else -1.0

                    log.debug(
                        "Create inputs for (samples, systematic) = ([\"{samples}\"], {systematic}), (channel, category) = ({channel}, {category})."
                        .format(samples="\", \"".join(list_of_samples),
                                channel=channel,
                                category=category,
                                systematic=systematic))

                    if tmp_quantity:
                        x_expression = tmp_quantity
                    else:
                        x_expression = "testZttPol13TeV_" + category
                        if tmp_omega_version:
                            x_expression = expression_settings.expressions_dict[
                                x_expression].replace("BarSvfit",
                                                      tmp_omega_version)
                    # Resolve an expression key to its definition if there is
                    # one; otherwise keep the string as a literal expression.
                    x_expression = expression_settings.expressions_dict.get(
                        x_expression, x_expression)

                    # prepare plotting configs for retrieving the input histograms
                    config = sample_settings.get_config(
                        samples=[
                            getattr(samples.Samples, sample)
                            for sample in list_of_samples
                        ],
                        channel=channel,
                        category="catZttPol13TeV_" + category,
                        weight=args.weight,
                        # NOTE(review): presumably converts fb^-1 to pb^-1 -- confirm.
                        lumi=args.lumi * 1000,
                        higgs_masses=higgs_masses,
                        estimationMethod="new",
                        polarisation_bias_correction=True,
                        cut_type="low_mvis_smhtt2016",
                        exclude_cuts=(["m_vis"]
                                      if x_expression == "m_vis" else []),
                        no_ewk_samples=args.no_ewk_samples,
                        no_ewkz_as_dy=True,
                        asimov_nicks=asimov_nicks)

                    systematics_settings = systematics_factory.get(
                        shape_systematic)(config)
                    # TODO: evaluate shift from datacards_per_channel_category.cb
                    config = systematics_settings.get_config(shift=shift)

                    # Gen-level polarisation nicks get a constant expression
                    # and a single bin instead of the fit variable.
                    config["x_expressions"] = [
                        "0" if _is_gen_polarisation_nick(nick) else x_expression
                        for nick in config["nicks"]
                    ]

                    binnings_key = ("binningZttPol13TeV_" + category + "_" +
                                    x_expression)
                    if binnings_key not in binnings_settings.binnings_dict:
                        binnings_key = "binningZttPol13TeV_" + category + (
                            ("_" + tmp_quantity) if tmp_quantity else "")
                    if binnings_key in binnings_settings.binnings_dict:
                        config["x_bins"] = [
                            "1,-1,1" if _is_gen_polarisation_nick(nick)
                            else binnings_key
                            for nick in config["nicks"]
                        ]

                    if args.fixed_binning:
                        if args.fixed_variables and channel == "tt":
                            # Keep only the requested number of bins and
                            # force the range to [-1.0001, 1.0001].
                            fixed_bins = (args.fixed_binning.split(",")[0] +
                                          ",-1.0001,1.0001")
                        else:
                            fixed_bins = args.fixed_binning
                        config["x_bins"] = [
                            fixed_bins for nick in config["nicks"]
                        ]

                    config["directories"] = [args.input_dir]

                    histogram_name_template = (
                        "${BIN}/${PROCESS}" if nominal else
                        "${BIN}/${PROCESS}_${SYSTEMATIC}")
                    config["labels"] = [
                        histogram_name_template.replace("$", "").format(
                            PROCESS=datacards.configs.sample2process(
                                sample.replace("asimov", "data")),
                            BIN=category,
                            SYSTEMATIC=systematic)
                        for sample in config["labels"]
                    ]

                    tmp_output_file = os.path.join(
                        args.output_dir,
                        "input/{ANALYSIS}_{CHANNEL}_{BIN}_{SYSTEMATIC}_{ERA}.root"
                        .format(ANALYSIS="ztt",
                                CHANNEL=channel,
                                BIN=category,
                                SYSTEMATIC=systematic,
                                ERA="13TeV"))
                    tmp_output_files.append(tmp_output_file)
                    config["output_dir"] = os.path.dirname(tmp_output_file)
                    config["filename"] = os.path.splitext(
                        os.path.basename(tmp_output_file))[0]

                    config["plot_modules"] = ["ExportRoot"]
                    config["file_mode"] = "UPDATE"

                    if "legend_markers" in config:
                        config.pop("legend_markers")

                    plot_configs.append(config)

            hadd_commands.append("hadd -f {DST} {SRC} && rm {SRC}".format(
                DST=output_file, SRC=" ".join(tmp_output_files)))

    # The export runs in UPDATE mode, so files left over from a previous run
    # have to be deleted before they are written again.
    stale_files = set(
        os.path.join(config["output_dir"], config["filename"] + ".root")
        for config in plot_configs[:args.n_plots[0]])
    for stale_file in stale_files:
        if os.path.exists(stale_file):
            os.remove(stale_file)
            log.debug("Removed file \"" + stale_file +
                      "\" before it is recreated again.")
    output_files = list(set(output_files))

    higgsplot.HiggsPlotter(list_of_config_dicts=plot_configs,
                           list_of_args_strings=[args.args],
                           n_processes=args.n_processes,
                           n_plots=args.n_plots[0],
                           batch=args.batch)

    if args.n_plots[0] != 0:
        tools.parallelize(_call_command,
                          hadd_commands,
                          n_processes=args.n_processes)

    # NOTE: the debug plot configs are still assembled, but the plotter call
    # that would consume them is currently disabled.
    debug_plot_configs = []
    for output_file in output_files:
        debug_plot_configs.extend(plotconfigs.PlotConfigs().all_histograms(
            output_file,
            plot_config_template={
                "markers": ["E"],
                "colors": ["#FF0000"]
            }))
    if args.www:
        for debug_plot_config in debug_plot_configs:
            debug_plot_config["www"] = debug_plot_config["output_dir"].replace(
                args.output_dir, args.www)
	parser.add_argument("-n", "--n-processes", type=int, default=1,
						help="Number of (parallel) processes. [Default: %(default)s]")
	parser.add_argument("-p", "--n-plots", type=int,
							help="Number of plots. [Default: all]")
	#parser.add_argument("--calculate-separation", action="store_true", default =False,
							#help="calculate separation using TestSample")
	#parser.add_argument("--no-plot", action="store_true", default =False,
							#help="skip plotting")
	args = parser.parse_args()
	logger.initLogger(args)
	nice_channel = {
		"em": "#scale[1.5]{e#mu ",
		"mt": "#scale[1.5]{#mu#tau_{h} ",
		"et": "#scale[1.5]{e#tau_{h} ",
		"tt": "#scale[1.5]{#tau_{h}#tau_{h} "}
	binnings_dict = import_binnings.BinningsDict()
	input_bdts = {}
	for i,in_dir in enumerate(args.input_dirs):
		input_files = [(dirpath, f) for dirpath, dirnames, files in os.walk(in_dir) for f in fnmatch.filter(files, args.matching)]
		for path, file_name in input_files:
			bdt_name = file_name[:-7] #cut away TX.root
			if not bdt_name in input_bdts.keys():
				input_bdts[bdt_name] = {in_dir: [os.path.join(path, file_name)]}
			elif in_dir not in input_bdts[bdt_name].keys():
				input_bdts[bdt_name][in_dir] = [os.path.join(path, file_name)]
			else:
				input_bdts[bdt_name][in_dir].append(os.path.join(path, file_name))
	configs_list = []
	jsonTools.JsonDict(input_bdts).save("testPlot.json", indent=4)
	for name, plot in input_bdts.iteritems():
		bdt_var = "BDT_"+name
Beispiel #3
0
    # Clean the output dir
    args.output_dir = os.path.abspath(os.path.expandvars(args.output_dir))
    if args.clear_output_dir:
        clear_output_dir = raw_input(
            "Do you really want to clear the output directory? [yes]").lower(
            ) == "yes"
        if not clear_output_dir:
            log.info(
                "Terminate. Remove the clear_output_dir option and run the programm again."
            )
            sys.exit(1)
        logger.subprocessCall("rm -r " + args.output_dir, shell=True)

    sample_settings = samples.Samples()
    binnings_settings = binnings.BinningsDict()
    systematics_factory = systematics.SystematicsFactory()
    www_output_dirs_postfit = []
    www_output_dirs_weightbin = []
    www_output_dirs_parabola = []

    plot_configs = []
    output_files = []
    merged_output_files = []
    hadd_commands = []

    # Initialise directory and naming scheme templates for datacards
    tmp_input_root_filename_template = "input/${ANALYSIS}_${CHANNEL}_${BIN}_${SYSTEMATIC}_${ERA}.root"
    input_root_filename_template = "input/${ANALYSIS}_${CHANNEL}_${ERA}.root"
    bkg_histogram_name_template = "${BIN}/${PROCESS}"
    sig_histogram_name_template = "${BIN}/${PROCESS}"
def calculate_partial_correlation(config):
	"""Fill pairwise profile histograms and weighted sums for one sample.

	The steps performed by the visible code are:

	1. Collect the input ROOT files of all relevant nicks and pick up the
	   weight string and scale factor belonging to them.
	2. Optionally (``config["prepare_samples"]``) copy the selected events of
	   all input files into one storage file
	   ``<storage_name_extension>/NTuples.root``.
	3. Book one ``ROOT.TProfile`` per ordered variable pair (both orderings)
	   and one per variable with itself.
	4. Loop over the stored tree, fill the profiles and accumulate weighted
	   sums of x, x**2 and x*y in ``corr_vars``.
	5. Write the profiles to ``Histograms.root`` and the whole ``config``
	   (including the sums) to ``Correlations.json`` below
	   ``<output_dir_path>/<channel>/<category>/<request_nick>``.

	Keys read from ``config``: channel, category, request_nick,
	storage_name_extension, folders, nicks, files, root_input_dir, weights,
	scale_factors, prepare_samples, overwrite_samples, output_dir_path,
	parameters_list, weight_variable.

	Keys written to ``config``: storage_file, storage_ntuple, lumi,
	correlations.

	NOTE(review): this code relies on Python 2 semantics (eager ``map``,
	``iterkeys``).
	"""
	channel = config["channel"]
	category = config["category"]
	requested_sample = config["request_nick"]
	#construct list of rootfiles, if prepare samples is enabled, files are produced
	c_tree = ""
	c_tree_list = ROOT.TList()
	root_file_name_list = []
	root_storage_file = "%s/%s.root"%(config["storage_name_extension"], "NTuples")
	config["storage_file"] = root_storage_file
	# tree name inside the storage file: first folder with "/ntuple" stripped
	config["storage_ntuple"]=config["folders"][0].replace("/ntuple", "")

	cuts = ""
	#find all physical files and store them in root_filename_list
	for i,nick in enumerate(config["nicks"]):
		# skip "noplot" nicks, except those containing "wmh", "wph" or "zh"
		if not bool(sum([x in nick for x in ["wmh", "wph", "zh"]])) and "noplot" in nick:
			continue
		log.debug("search files:\n" + str(config["files"][i]))
		# Split the space-separated file string, prepend root_input_dir to each
		# entry, expand wildcards with glob.glob and extend root_file_name_list
		# in place with the matches.
		# NOTE(review): this only has an effect while map() is eager (Python 2);
		# with a lazy map() the list would stay empty.
		map(root_file_name_list.__iadd__, map(glob.glob, map(config["root_input_dir"].__add__, config["files"][i].split(" "))))
		# all nicks of one sample are expected to share the same weight string;
		# a mismatch is only logged, processing continues with the latest one
		if (not cuts == "") and (not cuts == config["weights"][i]):
			log.error("can not decide which weight to use for sample %s nick %s" %(config["request_nick"],nick))
			log.error(config)
			#sys.exit()
		cuts = config["weights"][i]
		# overwritten for every processed nick, so the last one wins
		config["lumi"] = config["scale_factors"][i]

	# (re)create the storage file if requested and it is either missing or
	# overwriting is enabled
	if config["prepare_samples"] and not (os.path.exists(root_storage_file) and not config["overwrite_samples"]):
		log.info("Recreate Sample %s"%root_storage_file)
		# one TChain per input file, pointing at "<file>/<folders[0]>"
		for root_file_name in root_file_name_list:
			log.debug("Prepare Rootfile %s as Sample %s" %(root_file_name, config["request_nick"]))
			c_tree_list.append(ROOT.TChain())
			root_file_name = root_file_name + '/' + config["folders"][0]
			c_tree_list[-1].Add(root_file_name)
			c_tree_list[-1].SetName("list_tree")

		store_file = ROOT.TFile(root_storage_file, "RECREATE")
		# the weight string doubles as selection once the per-event weight
		# factors are removed from it
		selection_string = cuts.replace("eventWeight*", "")
		if config["request_nick"] in ["ztt", "zll"]:
			# additionally drop the "*stitchWeight<NICK>" factor for these samples
			selection_string = cuts.replace("eventWeight*", "").replace("*stitchWeight%s"%(config["request_nick"].upper()), "")
		# replace every chain by the copy containing only the selected events
		for index in range(len(c_tree_list)):
			log.debug("Cut Tree %s for Sample %s "%(root_file_name_list[index], root_storage_file))
			c_tree_list[index]=c_tree_list[index].CopyTree(selection_string)
		log.debug("Merge Trees for Sample %s "%root_storage_file)
		if len(c_tree_list) > 1:
			c_tree = ROOT.TTree.MergeTrees(c_tree_list)
		elif len(c_tree_list) == 1:
			c_tree = c_tree_list[0]
		else:
			# no input files: write an empty chain
			c_tree =ROOT.TChain()
		log.debug("Prepare Sample %s from %i files"%(root_storage_file,len(c_tree_list)))
		c_tree.SetName(config["folders"][0].replace("/ntuple", ""))
		c_tree.Write()
		# drop the list entries and the list itself before the file is closed
		# NOTE(review): presumably needed for ROOT object ownership -- confirm
		for i in range(len(c_tree_list)):
			del c_tree_list[0]
		del c_tree_list
		#store_file.Write()
		store_file.Close()
	#log.error("Reached Breakpoint, would start calculations next!")
	#sys.exit()
	root_histograms = {}
	# accumulators, keyed by "x+-+y" (sum of w*x*y), "x" (sum of w*x) and
	# "var_x" (sum of w*x**2)
	corr_vars = {}
	binnings_dict = import_binnings.BinningsDict()
	nick_path = os.path.join(config["output_dir_path"], channel, category, requested_sample)
	log.debug("save output to folder %s"%nick_path)
	if not os.path.exists(nick_path):
		os.makedirs(nick_path)
	root_inf = ROOT.TFile(config["storage_file"], "read")
	root_inst = root_inf.Get(config["storage_ntuple"])
	log.debug("===============================X-Y Correlations==============================")
	for variables in itertools.combinations(config["parameters_list"], 2):
		bins_raw = [binnings_dict.get_binning("%s_"%config["channel"]+variables[0]).strip(),
					binnings_dict.get_binning("%s_"%config["channel"]+variables[1]).strip()]
		# Reduce each binning string to [10, low, high]:
		#   space separated -> first and last token are used as range
		#   comma separated -> second and last token are used as range
		#   anything else   -> default range 0..250
		for i, bins in enumerate(bins_raw):
			if " " in bins:
				tmp_bins = bins.split(" ")
				bins_raw[i] = [10,float(tmp_bins[0]),float(tmp_bins[-1])]
			elif "," in bins:
				tmp_bins = bins.split(",")
				bins_raw[i] = [10,float(tmp_bins[1]),float(tmp_bins[-1])]
			else:
				bins_raw[i] = [10,0.,250]


		[xbins, ybins] = bins_raw
		# TProfile arguments: name, title, nbins, xlow, xhigh, ylow, yhigh, option 's'
		# (the debug messages below say "TH2F", but TProfile objects are booked)
		root_histograms["+-+".join(variables)] = ROOT.TProfile("+-+".join(variables),
													"correlation: %s vs %s"%variables, *(xbins+ybins[1:]+[ 's']))
		log.debug("Generate RootHistogram TH2F: %s"%("+-+".join(variables)))
		log.debug("options: " + str(xbins+ybins[1:]+[ 's']))
		# same pair with the axes swapped
		root_histograms["+-+".join(variables[-1::-1])] = ROOT.TProfile("+-+".join(variables[-1::-1]),
													"correlation: %s vs %s"%(variables[-1::-1]), *(ybins+xbins[1:]+[ 's']))
		log.debug("Generate RootHistogram TH2F: %s"%("+-+".join(variables[-1::-1])))
		log.debug("options: " + str(ybins+xbins[1:]+[ 's']))
		# only one ordering of the pair gets an accumulator
		corr_vars["+-+".join(variables)] = 0
		# detach the histograms from the current ROOT directory and from
		# Python's ownership
		root_histograms["+-+".join(variables)].SetDirectory(0)
		ROOT.SetOwnership (root_histograms["+-+".join(variables)], False)
		root_histograms["+-+".join(variables[-1::-1])].SetDirectory(0)
		ROOT.SetOwnership (root_histograms["+-+".join(variables[-1::-1])], False)

	log.debug("===============================X-X Correlations==============================")
	for variable in config["parameters_list"]:
		bins = binnings_dict.get_binning("%s_"%config["channel"]+variable).strip()
		bins_raw = []
		# same binning reduction as for the variable pairs above
		if " " in bins:
			tmp_bins = bins.split(" ")
			bins_raw = [10,float(tmp_bins[0]),float(tmp_bins[-1])]
		elif "," in bins:
			tmp_bins = bins.split(",")
			bins_raw = [10,float(tmp_bins[1]),float(tmp_bins[-1])]
		else:
			bins_raw = [10,0.,250]

		log.debug("Generate RootHistogram TH2F: %s"%("+-+".join([variable,variable])))
		log.debug(bins_raw*2)
		root_histograms["+-+".join([variable,variable])] = ROOT.TProfile("+-+".join([variable,variable]),
													"correlation: %s vs %s"%(variable,variable), *(bins_raw+bins_raw[1:]+[ 's']))
		corr_vars["+-+".join([variable,variable])] = 0
		corr_vars[variable] = 0
		corr_vars["var_%s"%variable] = 0
		root_histograms["+-+".join([variable,variable])].SetDirectory(0)
		ROOT.SetOwnership (root_histograms["+-+".join([variable,variable])], False)
	log.info( "=================================================================================")
	log.info( "Calculate correlations in sample %s and make scatter plots for %i variable pairs."%(config["request_nick"],len(root_histograms)))
	log.info( "=================================================================================")
	#sys.exit()
	i = 0.  # running sum of event weights
	n = 0   # number of processed events
	# per-variable offset subtracted before accumulating; only ever set to 0
	# in this function, so the sums below are plain (uncentred) weighted sums
	zero_vals = {}
	lumi_val = config["lumi"]
	for event in root_inst:
		# variables whose single-variable sums were already updated for this event
		calced_means = []
		w = event.__getattr__(config["weight_variable"]) * lumi_val
		if config["request_nick"] in ["ztt", "zll", "wj"]:
			w *= event.__getattr__("stitchWeight%s"%(config["request_nick"].upper()))
		for varxy in corr_vars.iterkeys():
			# only the pair keys ("x+-+y") drive the loop; single-variable
			# keys are updated from within
			if not "+-+" in varxy:
				continue
			varx, vary = map(str, varxy.split("+-+"))
			x, y = map(event.__getattr__, map(str, varxy.split("+-+")))
			# NOTE(review): for self-pairs (varx == vary) both calls fill the
			# same histogram, i.e. it is filled twice per event -- confirm intended
			root_histograms["+-+".join([varx, vary])].Fill(x, y, w)
			root_histograms["+-+".join([vary, varx])].Fill(y, x, w)


			if varx not in zero_vals:
				zero_vals[varx] = 0
			if vary not in zero_vals:
				zero_vals[vary] = 0

			if varx not in calced_means:
				#log.info( "calculate mean for %s" %varx)
				corr_vars[varx] += w*(x -zero_vals[varx])
				corr_vars["var_%s"%varx] += w*(x -zero_vals[varx])**2
				calced_means.append(varx)
			if vary not in calced_means:
				#log.info( "calculate mean for %s" %vary)
				corr_vars[vary] += w*(y - zero_vals[vary])
				corr_vars["var_%s"%vary] += w*(y - zero_vals[vary])**2
				calced_means.append(vary)
			corr_vars[varxy] += w*(x -zero_vals[varx]) * (y - zero_vals[vary])

		i += w
		n += 1
		if n%1000 == 0:
			log.info( "processed: %i events"%n)

	hist_file = ROOT.TFile(os.path.join(nick_path, "Histograms.root"),"RECREATE")
	for varxy in root_histograms.iterkeys():
		root_histograms[varxy].Write()
	# normalisation information needed to turn the sums into means/correlations
	corr_vars["weight_sum"] = i
	corr_vars["n"] = n
	hist_file.Close()
	root_inf.Close()
	config["correlations"] = corr_vars
	jsonTools.JsonDict(config).save(os.path.join(nick_path,"Correlations.json"),indent=4)