コード例 #1
0
    parameters["hist_vars"] += ["score_" + m for m in parameters["bdt_models"]]

    # parameters['plot_vars'] = ['dimuon_mass']
    parameters["plot_vars"] = parameters["hist_vars"]
    parameters["datasets"] = datasets

    all_paths = {}
    for year in parameters["years"]:
        all_paths[year] = {}
        for dataset in datasets:
            paths = glob.glob(
                f"{parameters['path']}/"
                f"{year}_{parameters['label']}/"
                f"{dataset}/*.parquet"
            )
            all_paths[year][dataset] = paths

    if args.remake_hists:
        for year in parameters["years"]:
            print(f"Processing {year}")
            for dataset, path in tqdm.tqdm(all_paths[year].items()):
                if len(path) == 0:
                    continue
                df = load_dataframe(client, parameters, inputs=[path])
                if not isinstance(df, dd.DataFrame):
                    continue
                to_histograms(client, parameters, df=df)

    if args.plot:
        yields = plotter(client, parameters)
コード例 #2
0
    parameters["datasets"] = datasets

    all_paths = {}
    for year in parameters["years"]:
        all_paths[year] = {}
        for dataset in datasets:
            paths = glob.glob(f"{parameters['path']}/"
                              f"{year}_{parameters['label']}/"
                              f"{dataset}/*.parquet")
            all_paths[year][dataset] = paths

    if args.remake_hists:
        dfs = []
        for path in tqdm.tqdm(all_paths):
            if len(path) == 0:
                continue
            df = load_dataframe(client, parameters, inputs=[path])
            dfs.append(df)
            # to_histograms(client, parameters, df=df)

        df = pd.concat(dfs)
        df.reset_index(inplace=True, drop=True)
        run_mva(client, parameters, df)

        # scores = {k: "test_adv_score" for k in parameters["mva_channels"]}
        # categorize_by_score(df, scores)
        # print(df[["channel", "category"]])

    if args.plot:
        plotter(client, parameters)
コード例 #3
0
        "samp_info": samp_info,
        "do_timer": False,
        "do_btag_syst": False
    }

    executor = DaskExecutor(**executor_args)
    run = Runner(executor=executor, schema=NanoAODSchema, chunksize=10000)
    out_df = run(
        samp_info.fileset,
        "Events",
        processor_instance=DimuonProcessor(**processor_args),
    )

    df = load_dataframe(client, parameters, inputs=out_df)
    out_hist = to_histograms(client, parameters, df=df)
    out_plot = plotter(client, parameters, hist_df=out_hist)

    elapsed = round(time.time() - tick, 3)
    print(f"Finished everything in {elapsed} s.")

    out_df = out_df.compute()
    dimuon_mass = out_df.loc[out_df.event == 2, "dimuon_mass"].values[0]
    jj_mass = out_df.loc[out_df.event == 2, "jj_mass nominal"].values[0]

    assert out_df.shape == (21806, 116)
    assert almost_equal(dimuon_mass, 124.16069531)
    assert almost_equal(jj_mass, 1478.3898375)

    slicer = {
        "region": "h-peak",
        "channel": "vbf",
コード例 #4
0
ファイル: trainer.py プロジェクト: ArnabPurohit/hmumu-coffea
def run_mva(client, parameters, df):
    """Train, evaluate and/or plot MVA models, one channel at a time.

    For every channel in ``parameters["mva_channels"]`` (default
    ``["ggh_0jets"]``) a ``Trainer`` is built from the rows of ``df`` whose
    ``channel`` column equals that channel. Depending on the flags below the
    trainer is then trained, evaluated and/or used to produce plots.

    Args:
        client: Handle forwarded unchanged to ``Trainer.run_training``,
            ``Trainer.run_evaluation``, ``to_histograms`` and ``plotter``.
        parameters: Configuration dict. The keys ``mva_path``,
            ``mva_models``, ``saved_models``, ``training_datasets``,
            ``training_features``, ``mva_do_training``,
            ``mva_do_evaluation`` and ``mva_do_plotting`` are *popped*
            (removed from the dict); ``mva_channels`` is only read.
            ``parameters["plots_path"]`` is overwritten with the per-channel
            output directory on every iteration.
        df: DataFrame with a ``channel`` column. When evaluation is enabled
            a ``<model>_score`` column is written into it in place for the
            rows of each processed channel.

    Returns:
        None.
    """
    # NOTE: these options are popped, so they are no longer present in the
    # caller's dict after this call. Kept as-is because the dict is later
    # copied and handed to to_histograms/plotter.
    mva_path = parameters.pop("mva_path", "./")
    mkdir(mva_path)
    mva_models = parameters.pop("mva_models", {})
    saved_models = parameters.pop("saved_models", {})
    training_datasets = parameters.pop("training_datasets", {})
    features = parameters.pop("training_features", [])
    do_training = parameters.pop("mva_do_training", False)
    do_evaluation = parameters.pop("mva_do_evaluation", False)
    do_plotting = parameters.pop("mva_do_plotting", False)
    channels_to_use = parameters.get("mva_channels", ["ggh_0jets"])

    for channel in channels_to_use:
        out_dir = f"{mva_path}/{channel}"
        mkdir(out_dir)
        parameters["plots_path"] = out_dir

        trainer = Trainer(
            df=df[df.channel == channel],
            channel=channel,
            ds_dict=training_datasets,
            features=features,
            out_path=out_dir,
            training_cut="(dimuon_mass > 110) & (dimuon_mass < 150)",
        )

        if do_training:
            trainer.add_models(mva_models.copy())
            trainer.run_training(client)

        # Only register saved models when this channel has a non-empty entry.
        channel_saved_models = saved_models.get(channel)
        if channel_saved_models:
            trainer.add_saved_models(channel_saved_models)

        if do_evaluation:
            trainer.run_evaluation(client)
            trainer.shape_in_bins(shape_of="dimuon_mass", nbins=6)

            # Copy each model's score back into the caller's DataFrame,
            # restricted to the rows belonging to this channel.
            in_channel = df.channel == channel
            for model_name in trainer.models.keys():
                score_name = f"{model_name}_score"
                df.loc[in_channel, score_name] = trainer.df.loc[:, score_name]

        if do_plotting:
            trainer.plot_roc_curves()

            # Newly configured models first, then saved ones; dict.fromkeys
            # removes duplicates while keeping a deterministic order (a set
            # would make the histogram/plot order hash-dependent).
            model_names = list(mva_models.get(channel, {})) + list(
                saved_models.get(channel, {})
            )
            score_names = [
                f"{model_name}_score"
                for model_name in dict.fromkeys(model_names)
            ]

            # dict.copy() is shallow: give the temporary parameters their own
            # "variables_lookup" so adding score variables below does not
            # leak into the caller's dict (and a missing key cannot raise).
            variables_lookup = dict(parameters.get("variables_lookup", {}))
            for score_name in score_names:
                variables_lookup[score_name] = Variable(
                    score_name, score_name, 50, 0, 1
                )

            parameters_tmp = parameters.copy()
            parameters_tmp["hist_vars"] = list(score_names)
            parameters_tmp["plot_vars"] = list(score_names)
            parameters_tmp["regions"] = ["h-peak", "h-sidebands"]
            parameters_tmp["channels"] = [channel]
            parameters_tmp["variables_lookup"] = variables_lookup

            hist_df = to_histograms(client, parameters_tmp, trainer.df)
            plotter(client, parameters_tmp, hist_df)