Exemple #1
0
def plot_comparison():
    """Compare the generated model with the selected data, one page per variable.

    For every phase-space variable of the current mode, plus the mode's
    ``ltime_var``, the model sample returned by ``get_model()`` and the data
    passing the final selection inside the mass signal region are handed to
    ``comparison.plot_comparison`` with the labels 'Model' and 'Data'.
    All pages are written to ``Gen_DATA_Comp.pdf`` in the 'effs' output path.
    """
    extra_vars = [gcm().ltime_var]

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.mass_signal_region()

    gen = get_model()

    outfile = gcm().get_output_path('effs') + 'Gen_DATA_Comp.pdf'
    with PdfPages(outfile) as pdf:
        for pc in gcm().phsp_vars + extra_vars:
            log.info('Plotting {}'.format(pc.var))
            filled = gen[pc.var]
            errorbars = data[pc.var][df_sel]
            if pc.convert is not None:
                # Apply the same transformation to both samples
                filled = pc.convert(filled)
                errorbars = pc.convert(errorbars)
            ax = comparison.plot_comparison(pc, filled, errorbars, 'Model',
                                            'Data')
            ax.set_xlabel(pc.xlabel)
            ax.yaxis.set_visible(False)
            ax.legend()
            pdf.savefig(plt.gcf())
            # pyplot keeps every figure alive until it is closed explicitly;
            # release it once the page has been written.
            plt.close()
Exemple #2
0
def train_reweighter():
    """Fit a ``GBReweighter`` from the generated model to the selected data.

    The training columns are the phase-space variables and ``ltime_var`` of
    the current mode whose name does not contain 'phi', plus the cosine and
    sine of ``phi1``. Every training column is mapped linearly from its
    configured limits onto [2, 3] in both samples before the fit. The fitted
    reweighter is stored through ``bdt_utils.dump_reweighter``.
    """
    ltime = gcm().ltime_var
    variables = gcm().phsp_vars + [ltime]
    columns = [v.var for v in variables if 'phi' not in v.var]
    columns.extend(['cosphi', 'sinphi'])

    # Data of the current mode, restricted later by the selection mask
    data = gcm().get_data([ltime.var])
    add_variables.append_phsp(data)
    data['cosphi'] = np.cos(data.phi1)
    data['sinphi'] = np.sin(data.phi1)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    # Generated sample gets the same helper columns
    gen = get_model()
    gen['cosphi'] = np.cos(gen.phi1)
    gen['sinphi'] = np.sin(gen.phi1)

    # (lower, upper) range per column; the trigonometric ones are fixed
    limits = {v.var: v.binning[1:] for v in variables}
    limits.update(cosphi=(-1., 1), sinphi=(-1., 1))

    for name in columns:
        lower, upper = limits[name]
        span = upper - lower
        for frame in (data, gen):
            frame[name] = (frame[name] - lower) / span + 2.

    log.info('Training BDT reweighter for {}'.format(', '.join(columns)))
    reweighter = GBReweighter(n_estimators=300, max_depth=5, learning_rate=0.2)

    # Both samples are reduced to the same number of randomly drawn rows
    original = gen[columns].sample(n=250000)
    target = data[columns][df_sel].sample(n=250000)
    reweighter.fit(original=original, target=target)
    bdt_utils.dump_reweighter(reweighter)
Exemple #3
0
def lifetime_study(correct_efficiencies=False):
    """Overlay each phase-space variable in slices of the ``ltime_var`` column.

    The slice boundaries are the 0, 20, ..., 100 % quantiles (as returned by
    ``helpers.weighted_quantile``) of the selected data. For every slice a
    weighted histogram, normalised to unit sum, is drawn with error bars
    computed from the sum of squared weights. One page per variable is
    written to the 'effs' output path.

    :correct_efficiencies: weight every event with ``1 / get_efficiency()``,
        drop the lowest boundary and write to ``DATA_ltime_dep_effs.pdf``
        instead of ``DATA_ltime_dep.pdf``.
    """
    ltime = gcm().ltime_var.var

    # Current mode stuff
    data = gcm().get_data([ltime])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()
    data['weight'] = 1.

    file_name = ('DATA_ltime_dep_effs.pdf' if correct_efficiencies
                 else 'DATA_ltime_dep.pdf')
    outfile = gcm().get_output_path('effs') + file_name

    boundaries = helpers.weighted_quantile(
        data[ltime][df_sel], np.arange(0, 1.1, 0.2))
    if correct_efficiencies:
        data['weight'] = 1./get_efficiency()
        boundaries = boundaries[1:]

    marker_style = dict(
        fmt='o', markersize=5, capthick=1, capsize=0, elinewidth=2,
        alpha=1)

    with PdfPages(outfile) as pdf:
        for var in gcm().phsp_vars:
            fig, ax = plt.subplots(figsize=(10, 10))
            n_bins = int(var.binning[0]/5.)
            value_range = var.binning[1:]
            for low, high in zip(boundaries[:-1], boundaries[1:]):
                in_slice = (data[ltime] > low) & (data[ltime] < high)
                chosen = df_sel & in_slice

                df = data[var.var][chosen]
                weight = data['weight'][chosen]

                # The label is quoted in ps, hence the factor 1000
                scaled = [low*1000, high*1000]
                rlow, prec = helpers.rounder(low*1000, scaled)
                rhigh, _ = helpers.rounder(high*1000, scaled)

                spec = '{{:.{}f}}'.format(prec)
                label = r'${} < \tau \mathrm{{ [ps]}}  < {}$'.format(
                    spec.format(rlow), spec.format(rhigh))

                values, edges = np.histogram(
                    df, bins=n_bins, range=value_range, weights=weight)
                # Sum of squared weights per bin gives the variance
                err, edges = np.histogram(
                    df, bins=n_bins, range=value_range, weights=weight**2)
                norm = np.sum(values)
                values = values/norm
                err = np.sqrt(err)/norm
                x_ctr = (edges[1:] + edges[:-1])/2.
                x_err = (edges[1:] - edges[:-1])/2.

                ax.errorbar(x_ctr, values, yerr=err, xerr=x_err,
                            label=label, **marker_style)
            ax.set_xlabel(var.xlabel)
            ax.yaxis.set_visible(False)
            ax.legend()
            pdf.savefig(plt.gcf())
            plt.close()
0
def get_efficiency():
    """Return ``compute_efficiency`` evaluated on the current mode's data.

    The data is loaded with the mode's ``ltime_var``, extended with the
    phase-space variables and with the cosine and sine of ``phi1``. Whether
    the efficiency is trained or read back is decided inside
    ``compute_efficiency``, which is not visible here.
    """
    ltime = gcm().ltime_var

    # Only needed for the log message below
    names = []
    for v in gcm().phsp_vars + [ltime]:
        if 'phi' in v.var:
            continue
        names.append(v.var)
    names.extend(['cosphi', 'sinphi'])
    log.info('Getting efficiencies for {}'.format(', '.join(names)))

    # Current mode stuff
    data = gcm().get_data([ltime.var])
    add_variables.append_phsp(data)

    phi = data.phi1
    data['cosphi'] = np.cos(phi)
    data['sinphi'] = np.sin(phi)
    return compute_efficiency(data)
Exemple #5
0
def simple_phsp_efficiencies():
    """Plot the selected-data over generated-model ratio for every variable.

    For each phase-space variable and ``ltime_var`` of the current mode,
    ``helpers.make_efficiency`` is called with the selected data as
    numerator and the model as denominator, both weighted to unit sum. The
    result is drawn with error bars, one page per variable, into
    ``Gen_DATA_Eff.pdf`` in the 'effs' output path.
    """
    extra_vars = [gcm().ltime_var]

    # Current mode stuff
    data = gcm().get_data([v.var for v in extra_vars])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    gen = get_model()

    marker_style = dict(
        fmt='o', markersize=5, capthick=1, capsize=0, elinewidth=2,
        alpha=1)

    outfile = gcm().get_output_path('effs') + 'Gen_DATA_Eff.pdf'
    with PdfPages(outfile) as pdf:
        for pc in gcm().phsp_vars + extra_vars:
            log.info('Plotting {}'.format(pc.var))
            denominator = gen[pc.var]
            numerator = data[pc.var][df_sel]

            # Uniform weights so that each sample sums to one
            n_den = denominator.index.size
            n_num = numerator.index.size
            weight_d = np.ones(n_den)*1./n_den
            weight_n = np.ones(n_num)*1./n_num

            fig, ax = plt.subplots(figsize=(10, 10))
            if pc.convert is not None:
                numerator = pc.convert(numerator)
                denominator = pc.convert(denominator)
            x, y, x_err, y_err = helpers.make_efficiency(
                numerator, denominator, 100, weight_n, weight_d,
                independent=True)

            ax.errorbar(x, y, yerr=y_err, xerr=x_err, **marker_style)
            ax.set_xlabel(pc.xlabel)
            ax.set_ylabel('Relative efficiency')
            pdf.savefig(plt.gcf())
            plt.close()
Exemple #6
0
def get_bdt_data(sw=False,
                 sklearn=True,
                 same_weight=False,
                 comb_data=False,
                 plot=False):
    """Returns the data for the bdt training, containing all the necessary
    variables and weights.

    Rows get ``labels == 1`` for signal and ``labels == 0`` for background.
    If the label-1 weight sum exceeds five times the label-0 weight sum, the
    label-1 rows are randomly down-sampled to that limit.

    :sw: Use sweights instead of sidebands and signal region
    :sklearn: return sklearn compatible dataframe
    :same_weight: Changes the weights to have identical normalisation
    :comb_data: Return BDT for comb_data, rand slow pion otherwise
    :plot: Also load the variables listed in the mode's ``just_plot``
    :returns: if sklearn is True:  DataFrame
              if sklearn is False: sig_df, bkg_df, sig_wgt, bkg_wgt
    """
    if comb_data:
        log.info('Doing returning data for combinatorial background')
        bdt_vars = gcm().comb_bkg_bdt_vars[:]
        bkg_sel = selection.comb_bkg_sideband_region()
    else:
        log.info('Doing returning data for random slow pion background')
        bdt_vars = gcm().rand_spi_bdt_vars[:]
        bkg_sel = selection.rand_spi_sideband_region()
    # bdt_vars is a copy (slice above), so extending it in place does not
    # touch the list stored on the mode.
    bdt_vars += gcm().spectator_vars
    # Only add the variables for plotting if needed
    if plot:
        bdt_vars += gcm().just_plot

    df = gcm().get_data(
        [v.var for v in bdt_vars if v.functor.additional is False])
    add_variables.append_angle(df)
    sel = selection.full_selection()
    add_variables.append_phsp(df)
    add_variables.append_dtf_ip_diff(df)

    for f in bdt_vars:
        if f.convert is not None:
            df[f.var] = f.convert(df[f.var])

    if sw:
        from analysis.mass_fitting import get_sweights
        sweights = get_sweights(gcm())

        # Fixed seed: the random split into the two labels is reproducible
        np.random.seed(42)
        # Builtin int replaces the np.int alias, which NumPy 1.24 removed
        df['labels'] = (np.random.rand(df.index.size) < 0.5).astype(int)
        sweights['bkg'] = sweights.rnd + sweights.comb
        df['weights'] = (df['labels'] == 1) * sweights.sig
        df['weights'] += (df['labels'] == 0) * (
            sweights['rnd'] + sweights['comb'])  # NOQA

        # Weights are only present for those rows that are selected,
        # so we select
        df = df.loc[~np.isnan(df['weights'])]
    else:
        df['weights'] = np.ones(df.index.size)
        # Rows in neither region keep a NaN label and are dropped below
        df.loc[selection.mass_signal_region() & sel, 'labels'] = 1
        df.loc[bkg_sel & sel, 'labels'] = 0

    if config.add_wrongsign and gcm().mode not in config.wrong_sign_modes:
        import pandas as pd
        with opposite_mode():
            # NOTE(review): comb_data and plot are not forwarded, so the
            # opposite-mode rows always come from the random slow pion
            # configuration -- confirm this is intended.
            df_op = get_bdt_data(sw, sklearn=True)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent
        df = pd.concat([df, df_op.query('labels == 0')], ignore_index=True)

    tot0 = np.sum(df.query('labels == 0').weights)
    # Reduce the statistics of the signal to no more than 5 times the
    # background
    tot1 = np.sum(df.query('labels == 1').weights)
    if 5. * tot0 < tot1:
        sig_sel = df['labels'] == 1
        bkg_sel = df['labels'] == 0
        tot1_max = 5 * tot0
        log.info('Changing signal events {} ---> {}'.format(tot1, tot1_max))

        sel = df.index.isin(
            df.query('labels==1').sample(int(tot1_max), random_state=45).index)
        # 'keep' stays NaN for the signal rows that were not sampled
        df.loc[sig_sel & sel, 'keep'] = 1
        df.loc[bkg_sel, 'keep'] = 1
        log.info('DataFrame content before: {}'.format(df.index.size))
        if same_weight:
            # Weight the label 1 sample to have same total than label 0
            df.loc[df['labels'] == 1, 'weights'] = float(tot0) / tot1_max
        df = df.loc[~np.isnan(df['labels']) & ~np.isnan(df['keep'])]
        log.info('DataFrame content after: {}'.format(df.index.size))
    else:
        tot1_max = tot1
        if same_weight:
            # Weight the label 1 sample to have same total than label 0
            df.loc[df['labels'] == 1, 'weights'] = float(tot0) / tot1_max
        df = df.loc[~np.isnan(df['labels'])]

    if sklearn:
        return df

    sig_df = df.query('labels == 1')
    bkg_df = df.query('labels == 0')
    return sig_df, bkg_df, sig_df.weights, bkg_df.weights
Exemple #7
0
def phsp_comparison_plots():
    """Plots the mode sidebands and the opposite mode signal region phsp
    distributions. Only really meaningful if executed for the WS events.
    Opposite mode is plotted as solid, with the uncertainty propagated to
    the mode error plot.

    Every variable produces two pages in ``phsp_comp.pdf`` (in the
    'selection' output path): first without weights, then with the
    opposite-mode sample weighted by a function of ``ltime_var``.
    """
    # Beside phase space, also plot D0 momentum and flight distance
    extra_vars = [
        gcm().ltime_var,
        PlotConfig(vars.pt,
                   gcm().D0, (100, 0, 15000)),
        PlotConfig(vars.vdchi2,
                   gcm().D0, (100, 0, 10), np.log,
                   r'$\ln(\text{{{}}})$'),  # NOQA
    ]
    # opposite_mode
    with opposite_mode():
        OS = gcm().get_data([f.var for f in extra_vars])
        add_variables.append_phsp(OS)
        os_sel = final_selection.get_final_selection()
        os_sel &= selection.delta_mass_signal_region()

        os_ltime = OS[gcm().ltime_var.var]
        # NOTE(review): hard-coded weight parametrisation; the origin of
        # the constants is not visible in this file.
        OS_weight = erf(os_ltime * 1600) / 24. + 0.038 + os_ltime * 4

    # Current mode stuff
    DF = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(DF)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.mass_sideband_region()

    # The weights do not depend on the plotted variable
    filled_weights = OS_weight[os_sel]

    outfile = gcm().get_output_path('selection') + 'phsp_comp.pdf'
    with PdfPages(outfile) as pdf:
        for pc in gcm().phsp_vars + extra_vars:
            log.info('Plotting {}'.format(pc.var))
            filled = OS[pc.var][os_sel]
            errorbars = DF[pc.var][df_sel]
            if pc.convert is not None:
                filled = pc.convert(filled)
                errorbars = pc.convert(errorbars)
            # First page unweighted, second page with the weights applied
            for extra in ({}, {'filled_weight': filled_weights}):
                ax = comparison.plot_comparison(pc,
                                                filled,
                                                errorbars,
                                                'RS signal',
                                                'WS background',
                                                normed_max=True,
                                                **extra)
                ax.set_xlabel(pc.xlabel)
                plot_utils.y_margin_scaler(ax, lf=0, la=True)
                ax.set_ylabel('Arbitrary units')
                ax.legend()
                pdf.savefig(plt.gcf())
                # Close every page's figure; previously the weighted one
                # was left open and leaked into the next iteration.
                plt.close()
Exemple #8
0
def dependence_study(use_efficiencies=False):
    """Plot the data/model ratio of one variable in slices of another.

    For every ordered pair of variables (phase-space variables plus
    ``ltime_var``) the selected data is split at its 0, 20, ..., 100 %
    quantiles of the first variable, and ``helpers.make_efficiency`` is
    evaluated for the second variable inside each slice. One page per pair
    is written to the 'effs' output path.

    The y-axis limits of each pair are stored in ``limits_for_eff.p``. They
    are (re)computed when ``use_efficiencies`` is false or no entry exists,
    and re-applied otherwise, so both variants share the same axis range.

    :use_efficiencies: weight the generated sample with
        ``get_efficiency_gen()`` and write ``Gen_DATA_Eff_dep_eff.pdf``
        instead of ``Gen_DATA_Eff_dep.pdf``.
    """
    extra_vars = [
        gcm().ltime_var
    ]
    all_vars = gcm().phsp_vars + extra_vars

    # Current mode stuff
    data = gcm().get_data([f.var for f in extra_vars])
    add_variables.append_phsp(data)
    df_sel = final_selection.get_final_selection()
    df_sel &= selection.delta_mass_signal_region()

    gen = get_model()

    if use_efficiencies:
        outfile = gcm().get_output_path('effs') + 'Gen_DATA_Eff_dep_eff.pdf'
        gen['weight'] = get_efficiency_gen()
    else:
        outfile = gcm().get_output_path('effs') + 'Gen_DATA_Eff_dep.pdf'
        gen['weight'] = 1.

    lim_file = gcm().get_output_path('effs') + 'limits_for_eff.p'

    # Loop invariants
    percentiles = np.arange(0, 1.1, 0.2)
    options = dict(
        fmt='o', markersize=5, capthick=1, capsize=0, elinewidth=2,
        alpha=1)

    with PdfPages(outfile) as pdf:
        for selected, plotted in permutations(all_vars, 2):
            log.info('Plotting {} in intervals of {}'.format(
                plotted.var, selected.var))
            boundaries = helpers.weighted_quantile(
                data[selected.var][df_sel], percentiles)
            fig, ax = plt.subplots(figsize=(10, 10))
            for low, high in zip(boundaries[:-1], boundaries[1:]):
                num_sel = (data[selected.var] > low) & (data[selected.var] < high)  # NOQA
                den_sel = (gen[selected.var] > low) & (gen[selected.var] < high)

                denominator = gen[plotted.var][den_sel]
                numerator = data[plotted.var][df_sel & num_sel]

                # Normalise both weight sets to unit sum. A new object is
                # built rather than dividing the selected slice in place.
                weight_d = gen['weight'][den_sel]
                weight_d = weight_d / np.sum(weight_d)
                weight_n = np.ones(numerator.index.size)*1./numerator.index.size  # NOQA

                x, y, x_err, y_err = helpers.make_efficiency(
                    numerator, denominator, 50, weight_n, weight_d, independent=True)  # NOQA

                rlow, prec = helpers.rounder(low, boundaries)
                rhigh, _ = helpers.rounder(high, boundaries)

                spec = '{{:.{}f}}'.format(prec)
                label = r'${} <$ {} $ < {}$'.format(
                    spec.format(rlow), selected.xlabel, spec.format(rhigh))

                ax.errorbar(x, y, y_err, x_err, label=label, **options)
            ax.set_xlabel(plotted.xlabel)
            ax.set_ylabel('Relative efficiency')

            # A missing or unreadable limits file starts a fresh mapping.
            # Exception (not a bare except) keeps Ctrl-C and SystemExit
            # working while the plots are produced.
            try:
                limits = load(lim_file)
            except Exception:
                limits = None
            if limits is None:
                log.info('Creating new limits file')
                limits = {}

            key = (plotted.var, selected.var)
            if key not in limits or not use_efficiencies:
                plot_utils.y_margin_scaler(ax, hf=0.4)
                limits[key] = ax.get_ylim()
            else:
                log.info('Applying limits')
                ax.set_ylim(limits[key])
            dump(limits, lim_file)
            ax.legend()
            pdf.savefig(plt.gcf())
            plt.close()
Exemple #9
0
def correlations(comb_bkg=False):
    """Draw clustered correlation matrices for signal and background regions.

    The variables are the BDT features of the chosen configuration, the
    phase-space variables, and the two extra functors ``m`` (for the D0)
    and ``dtf_dm``. For the mass signal region and for the background
    sideband, each combined with the full selection, the correlation matrix
    (in percent) is ordered by hierarchical clustering. Its lower triangle
    is saved as ``correlations_sig.pdf`` / ``correlations_bkg.pdf`` in the
    BDT output folder.

    :comb_bkg: use the combinatorial background features, sideband and
        output folder instead of the random slow pion ones.
    """
    sns.set(style="white")

    if comb_bkg:
        features_config = gcm().comb_bkg_bdt_vars
        bdt_folder = 'bdt_comb_bkg'
        bkg_sel = selection.comb_bkg_sideband_region()
    else:
        features_config = gcm().rand_spi_bdt_vars
        bdt_folder = 'bdt_rand_spi'
        bkg_sel = selection.rand_spi_sideband_region()

    functors = set()
    for pc in features_config:
        functors.add((pc.functor, pc.particle))

    functors.add((m, gcm().D0))
    functors.add((dtf_dm, None))

    # Columns whose name contains 'angle' are not requested here;
    # presumably append_angle below provides them.
    df = gcm().get_data(
        [name for name in (f(p) for f, p in functors)
         if 'angle' not in name])

    for pc in gcm().phsp_vars:
        functors.add((pc.functor, pc.particle))
    # Column names and LaTeX labels, in one common order
    varlist = [f(p) for f, p in functors]
    nlist = [f.latex(p) for f, p in functors]

    sel = selection.full_selection()
    add_variables.append_angle(df)
    add_variables.append_phsp(df)
    signal_sel = selection.mass_signal_region()

    for region_sel, suffix in zip([signal_sel, bkg_sel], ['sig', 'bkg']):
        # Use the region mask: previously it was ignored and both plots
        # showed the same matrix. Restricting to varlist makes the matrix
        # order match varlist/nlist, so the cluster ids computed below
        # belong to the right variables.
        corr = df[sel & region_sel][varlist].corr()

        row_linkage = hierarchy.linkage(np.asarray(corr), method='average')
        clusters = hierarchy.fcluster(row_linkage, 10, criterion='maxclust')

        # Stable ordering by cluster id keeps members of a cluster adjacent
        order = sorted(range(len(varlist)), key=lambda i: clusters[i])
        clustered = [varlist[i] for i in order]
        clustered_names = [nlist[i] for i in order]
        corr = corr.loc[clustered, clustered] * 100

        # Only the strict lower triangle is shown. Drop the first row and
        # the last column (both empty) so that the matrix has exactly as
        # many rows/columns as tick labels passed below.
        shown = corr.iloc[1:, :-1]
        mask = np.zeros(shown.shape, dtype=bool)
        mask[np.triu_indices_from(mask, k=1)] = True

        fig, ax = plt.subplots(figsize=(15, 15))
        heatmap = sns.heatmap(shown,
                              mask=mask,
                              annot=True,
                              ax=ax,
                              vmin=-100,
                              square=True,
                              vmax=100,
                              fmt="+2.0f",
                              linewidths=.8,
                              yticklabels=clustered_names[1:],
                              xticklabels=clustered_names[:-1],
                              cbar=False)
        heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)
        heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0)

        fn = 'correlations_{}.pdf'.format(suffix)

        outfile = gcm().get_output_path(bdt_folder) + fn

        fig.savefig(outfile)
        # Release the figure before drawing the next region
        plt.close(fig)